Skip to content

Commit

Permalink
Merge pull request #180 from jdi-testing/release/0.2.62
Browse files Browse the repository at this point in the history
Release 0.2.62
  • Loading branch information
ivnglkv authored Feb 9, 2024
2 parents c5af2ea + be62bff commit 97c123a
Show file tree
Hide file tree
Showing 24 changed files with 449 additions and 87 deletions.
1 change: 1 addition & 0 deletions .env.dist
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
SELENOID_PARALLEL_SESSIONS_COUNT=4
4 changes: 4 additions & 0 deletions .github/workflows/.release-workflow-rc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true
- name: Load default environment variables values
uses: cardinalby/export-env-action@v2
with:
envFile: '.env.dist'
- name: Build
run: docker compose -f docker-compose.dev.yaml up -d
- name: Lint and Test
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/.release-workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ jobs:
with:
commit_message: increment version
branch: develop
- name: Load default environment variables values
uses: cardinalby/export-env-action@v2
with:
envFile: '.env.dist'
- name: Build
run: docker compose -f docker-compose.dev.yaml up -d
- name: Lint and Test
Expand Down
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ dataset/cache-labels/*
dataset/df/20*
.idea/*
HTMLgenerator/output/*
venv/
.pytest_cache/*
.coverage
.coverage.*
Expand All @@ -51,5 +50,6 @@ coverage.xml
.DS_Store

**/.DS_Store
.env
.env.rc
.env.dev
.env.dev
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,12 @@ cd jdi-qasp-ml
```
git checkout branch_name
```
4. build and start containers:
4. Copy `.env.dist` file to `.env`:
```
cp .env.dist .env
```
5. Adjust variables in `.env` file to your needs (refer to the [Settings](#settings) section).
6. Build and start containers:
```
docker-compose -f docker-compose.dev.yaml up --build
```
Expand All @@ -219,6 +224,13 @@ git pull
```
docker-compose -f docker-compose.dev.yaml up
```

## Settings

| Variable name | Description | Default value |
|----------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|
| SELENOID_PARALLEL_SESSIONS_COUNT | Total number of parallel Selenoid sessions.<br>It is also used to determine the number of processes used to calculate the visibility of page elements.<br>Set it to the number of hardware threads supported by your processor, or to that number minus 2 if you'd like to reduce CPU load. | 4 |

# Docker - get debugging info:
- http://localhost:5050/build - get the docker image build's datetime
- http://localhost:5050/files - get data sent by browser to model
Expand Down
3 changes: 2 additions & 1 deletion app/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ class PredictionRequestElement(BaseModel):


class PredictionInputModel(BaseModel):
__root__: List[PredictionRequestElement]
document: str
elements: str


class PredictedElement(BaseModel):
Expand Down
11 changes: 9 additions & 2 deletions app/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from app import MONGO_DB_HOST, MONGO_DB_PORT
from app.models import LoggingInfoModel
from app.tasks import ENV

client = pymongo.MongoClient(MONGO_DB_HOST, MONGO_DB_PORT)
mongo_db = client.jdn
Expand All @@ -15,6 +16,10 @@


def get_session_id() -> int:
# skip the database entirely when running locally
if ENV == "LOCAL":
return 0

session_id_entry = mongo_db.sesion_id_collection.find_one()
if session_id_entry:
new_result = session_id_entry["session_id"] + 1
Expand All @@ -28,8 +33,10 @@ def get_session_id() -> int:


def create_logs_json_file() -> None:
logs_collection = mongo_db.logs
all_logs = logs_collection.find()
all_logs = {}
if ENV != "LOCAL":
logs_collection = mongo_db.logs
all_logs = logs_collection.find()

with open("logs.json", "w") as output_file:
json.dump(json.loads(dumps(all_logs)), output_file)
Expand Down
107 changes: 107 additions & 0 deletions app/selenium_app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from typing import Iterable, Sized, Tuple, Dict, List

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import concurrent.futures

from app.logger import logger
from utils import config


def get_webdriver() -> webdriver.Remote:
    """Open a headless Chrome session on the Selenoid hub and return its driver."""
    options = Options()
    for flag in ("--no-sandbox", "--headless", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    # Capabilities understood by the hub: which browser/version to start and
    # the Selenoid-specific switch that turns video recording off.
    selenoid_capabilities = {
        "browserName": "chrome",
        "browserVersion": "118.0",
        "selenoid:options": {"enableVideo": False},
    }
    hub_url = "http://jdi-qasp-ml-selenoid:4444/wd/hub"

    return webdriver.Remote(
        command_executor=hub_url,
        desired_capabilities=selenoid_capabilities,
        options=options,
    )


def get_page_elements(driver: webdriver.Remote, page_content: str) -> List[WebElement]:
    """Inject page_content into the driver's current document and list all elements.

    The markup is appended to the end of ``document.body`` through a script
    call; the function then waits (up to 10 seconds) for a ``body`` element to
    be present and returns every element matched by the XPath ``//*``.
    """
    inject_script = "document.body.insertAdjacentHTML('beforeend', arguments[0]);"
    driver.execute_script(inject_script, page_content)

    body_locator = (By.TAG_NAME, "body")
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(body_locator))

    return driver.find_elements(by=By.XPATH, value="//*")


def get_elements_visibility(page_content: str, starting_element_idx: int, ending_element_idx: int) -> Dict[str, bool]:
    """Return the visibility of a slice of the elements contained in page_content.

    A dedicated webdriver session is opened, page_content is loaded through
    get_page_elements(), and only the elements in
    ``[starting_element_idx:ending_element_idx]`` of that list are inspected.

    Args:
        page_content: markup to inject into the browser page.
        starting_element_idx: index of the first element to inspect (inclusive).
        ending_element_idx: index just past the last element to inspect.

    Returns:
        Mapping from each inspected element's ``jdn-hash`` attribute value to
        the result of its ``is_displayed()`` call.
    """
    driver = get_webdriver()
    try:
        all_elements = get_page_elements(driver, page_content)

        result = {}

        for element in all_elements[starting_element_idx:ending_element_idx]:
            # NOTE(review): elements without a jdn-hash attribute presumably
            # yield None here and therefore share a single key - confirm
            # whether callers rely on that.
            element_jdn_hash = element.get_attribute("jdn-hash")
            is_shown = element.is_displayed()
            result[element_jdn_hash] = is_shown
            logger.info(f"Element with jdn-hash {element_jdn_hash} {'Visible' if is_shown else 'Invisible'}")
    finally:
        # Always release the remote browser session, even when a lookup
        # above raises; otherwise the session stays open on the hub.
        driver.quit()

    return result


def get_chunks_boundaries(data: Sized, desired_chunks_amount: int) -> Iterable[Tuple[int, int]]:
    """Yield ``(start, end)`` slice boundaries splitting data into balanced chunks.

    The boundaries are contiguous, cover ``range(len(data))`` exactly once and
    differ in size by at most one element, so parallel workers get an even
    share. Empty chunks are never produced: when data holds fewer items than
    desired_chunks_amount, fewer boundaries are yielded (none at all for empty
    data).

    Args:
        data: any sized collection; only ``len(data)`` is used.
        desired_chunks_amount: maximum number of chunks to produce.

    Yields:
        ``(start, end)`` index pairs suitable for ``data[start:end]``.

    Raises:
        ValueError: if desired_chunks_amount is less than 1 (raised when the
            generator is first advanced).
    """
    if desired_chunks_amount < 1:
        raise ValueError("desired_chunks_amount must be at least 1")

    data_size = len(data)
    # The first `remainder` chunks take one extra element each instead of
    # piling the whole remainder onto the last chunk.
    base_size, remainder = divmod(data_size, desired_chunks_amount)

    start = 0
    for chunk_idx in range(desired_chunks_amount):
        end = start + base_size + (1 if chunk_idx < remainder else 0)
        if end == start:
            # Sizes never increase along the sequence, so every remaining
            # chunk would be empty as well.
            break
        yield start, end
        start = end


def get_element_id_to_is_displayed_mapping(page_content: str) -> Dict[str, bool]:
    """Return the visibility status of all elements in the page.

    The page is first loaded once to find out how many elements it contains;
    the element index range is then split into chunks and each chunk is
    evaluated by get_elements_visibility() in a separate process.

    Args:
        page_content: page markup, possibly containing backslash escape
            sequences that are resolved before the markup is loaded.

    Returns:
        Dictionary keyed by the elements' ``jdn-hash`` attribute value, with
        the element's displayed state as the value.
    """
    # Resolve backslash escapes. Encoding through latin-1 with
    # "backslashreplace" keeps non-ASCII characters intact: code points up to
    # U+00FF survive as single bytes and anything above is turned into a
    # \uXXXX / \UXXXXXXXX escape that unicode_escape decodes back. Encoding
    # as UTF-8 instead would have every multi-byte character decoded
    # byte-by-byte into mojibake.
    escaped_page_content = (
        str(page_content).encode("latin-1", "backslashreplace").decode("unicode_escape")
    )

    # This first session is only used to learn the number of elements.
    driver = get_webdriver()
    try:
        all_elements = get_page_elements(driver, escaped_page_content)
    finally:
        # Release the remote session even if loading the page fails.
        driver.quit()

    num_of_workers = config.SELENOID_PARALLEL_SESSIONS_COUNT
    jobs_chunks = get_chunks_boundaries(all_elements, num_of_workers)
    result = {}

    with concurrent.futures.ProcessPoolExecutor(max_workers=num_of_workers) as executor:
        futures = [
            executor.submit(get_elements_visibility, escaped_page_content, s, e)
            for s, e in jobs_chunks
        ]
        # Merge partial results in completion order; chunks cover disjoint
        # index ranges of the element list.
        for future in concurrent.futures.as_completed(futures):
            result.update(future.result())

    return result
36 changes: 20 additions & 16 deletions app/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import os

from lxml import etree, html

Expand All @@ -8,6 +9,8 @@
from app.redis_app import redis_app
from utils.robula import generate_xpath

ENV = os.getenv("ENV")


@celery_app.task(bind=True)
def task_schedule_xpath_generation(
Expand All @@ -21,22 +24,23 @@ def task_schedule_xpath_generation(
start_time = datetime.datetime.utcnow()
page = redis_app.get(document_uuid).decode("utf-8")
result = generate_xpath(element_id, page, document_uuid, config) # calculation itself
end_time = datetime.datetime.utcnow()
task_duration = end_time - start_time

document = html.fromstring(page)
element = document.xpath(element_id)[0]
tree = etree.ElementTree(document)
full_xpath = tree.getpath(element)
nesting_num = len(full_xpath.split("/")) - 1

task_kwargs = self.request.kwargs
task_kwargs["task_duration"] = str(
task_duration
) # for custom metrics logging to mongodb
task_kwargs["start_time"] = str(start_time) # for custom metrics logging to mongodb
task_kwargs["full_xpath"] = full_xpath # for custom metrics logging to mongodb
task_kwargs["nesting_num"] = nesting_num # for custom metrics logging to mongodb

# calculation of additional parameters if not locally
if ENV != "LOCAL":
end_time = datetime.datetime.utcnow()
task_duration = end_time - start_time

document = html.fromstring(page)
element = document.xpath(element_id)[0]
tree = etree.ElementTree(document)
full_xpath = tree.getpath(element)
nesting_num = len(full_xpath.split("/")) - 1

task_kwargs = self.request.kwargs
task_kwargs["task_duration"] = str(task_duration) # for custom metrics logging to mongodb
task_kwargs["start_time"] = str(start_time) # for custom metrics logging to mongodb
task_kwargs["full_xpath"] = full_xpath # for custom metrics logging to mongodb
task_kwargs["nesting_num"] = nesting_num # for custom metrics logging to mongodb

return result

Expand Down
17 changes: 17 additions & 0 deletions browsers.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"chrome": {
"default": "91.0",
"versions": {
"91.0": {
"image": "selenoid/vnc_chrome:91.0",
"port": "4444"
}, "114.0": {
"image": "selenoid/vnc_chrome:114.0",
"port": "4444"
}, "118.0": {
"image": "selenoid/vnc_chrome:118.0",
"port": "4444"
}
}
}
}
43 changes: 32 additions & 11 deletions docker-compose-rc.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,33 @@
version: '3'
version: '3.9'

name: jdi-qasp-ml
services:
# Browser image service. The tag is the same image that browsers.json maps
# to Chrome "118.0".
# NOTE(review): presumably declared only so that compose pulls the image
# Selenoid will launch - confirm.
chrome:
image: selenoid/vnc_chrome:118.0
# Keep this service's output out of the aggregated compose logs.
attach: false

# Selenoid hub. app/selenium_app.py connects to it by this container name
# (http://jdi-qasp-ml-selenoid:4444/wd/hub).
selenoid:
container_name: "jdi-qasp-ml-selenoid"
image: "aerokube/selenoid:latest"
ports:
# host port 4445 -> container port 4444
- "4445:4444"
volumes:
# Project directory, mounted where the -conf flag below expects browsers.json.
- ".:/etc/selenoid"
- "./target:/output"
# Host Docker socket.
# NOTE(review): presumably lets Selenoid start browser containers - confirm.
- "/var/run/docker.sock:/var/run/docker.sock"
# Backs the directory passed to -video-output-dir below.
- "./target:/opt/selenoid/video"
environment:
- "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
command:
- "-conf"
- "/etc/selenoid/browsers.json"
- "-video-output-dir"
- "/opt/selenoid/video"
- "-container-network"
- "jdi-qasp-ml_default"
- "-limit"
# The ${VAR:?message} form makes compose abort with the message when the
# variable is unset or empty.
- "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel running threads supported by your processor (-2 optionally)}"

api:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
container_name: jdi-qasp-ml-api
Expand All @@ -11,11 +38,14 @@ services:
driver: "json-file"
options:
max-size: "256m"
environment:
ENV: LOCAL
SELENOID_PARALLEL_SESSIONS_COUNT: $SELENOID_PARALLEL_SESSIONS_COUNT

celery:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
container_name: jdi-qasp-ml-celery
command: celery -A app.celery_app:celery_app worker -l info
command: bash -c "chmod +x start_celery.sh && ./start_celery.sh"
logging:
driver: "json-file"
options:
Expand All @@ -31,12 +61,3 @@ services:
driver: "json-file"
options:
max-size: "256m"

mongodb:
image: mongo:5.0
ports:
- 27017:27017
logging:
driver: "json-file"
options:
max-size: "256m"
29 changes: 29 additions & 0 deletions docker-compose-stable.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,39 @@
version: '3'

name: jdi-qasp-ml
services:
# Browser image service. The tag is the same image that browsers.json maps
# to Chrome "118.0".
# NOTE(review): presumably declared only so that compose pulls the image
# Selenoid will launch - confirm.
chrome:
image: selenoid/vnc_chrome:118.0
# Keep this service's output out of the aggregated compose logs.
attach: false

# Selenoid hub. app/selenium_app.py connects to it by this container name
# (http://jdi-qasp-ml-selenoid:4444/wd/hub).
selenoid:
container_name: "jdi-qasp-ml-selenoid"
image: "aerokube/selenoid:latest"
ports:
# host port 4445 -> container port 4444
- "4445:4444"
volumes:
# Project directory, mounted where the -conf flag below expects browsers.json.
- ".:/etc/selenoid"
- "./target:/output"
# Host Docker socket.
# NOTE(review): presumably lets Selenoid start browser containers - confirm.
- "/var/run/docker.sock:/var/run/docker.sock"
# Backs the directory passed to -video-output-dir below.
- "./target:/opt/selenoid/video"
environment:
- "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
command:
- "-conf"
- "/etc/selenoid/browsers.json"
- "-video-output-dir"
- "/opt/selenoid/video"
- "-container-network"
- "jdi-qasp-ml_default"
- "-limit"
# The ${VAR:?message} form makes compose abort with the message when the
# variable is unset or empty.
- "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel running threads supported by your processor (-2 optionally)}"

api:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:stable"
container_name: jdi-qasp-ml-api
command: uvicorn app.main:api --host 0.0.0.0 --port 5000
environment:
- SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
ports:
- "5050:5000"
logging:
Expand Down
Loading

0 comments on commit 97c123a

Please sign in to comment.