diff --git a/.env.dist b/.env.dist
new file mode 100644
index 00000000..d090e06f
--- /dev/null
+++ b/.env.dist
@@ -0,0 +1 @@
+SELENOID_PARALLEL_SESSIONS_COUNT=4
diff --git a/.github/workflows/.release-workflow-rc.yml b/.github/workflows/.release-workflow-rc.yml
index 65b38afe..0fceeb4c 100644
--- a/.github/workflows/.release-workflow-rc.yml
+++ b/.github/workflows/.release-workflow-rc.yml
@@ -24,6 +24,10 @@ jobs:
         uses: actions/checkout@v3
         with:
           submodules: true
+      - name: Load default environment variable values
+        uses: cardinalby/export-env-action@v2
+        with:
+          envFile: '.env.dist'
       - name: Build
         run: docker compose -f docker-compose.dev.yaml up -d
       - name: Lint and Test
diff --git a/.github/workflows/.release-workflow.yml b/.github/workflows/.release-workflow.yml
index 7ac93725..bcd6124a 100644
--- a/.github/workflows/.release-workflow.yml
+++ b/.github/workflows/.release-workflow.yml
@@ -40,6 +40,10 @@ jobs:
         with:
           commit_message: increment version
           branch: develop
+      - name: Load default environment variable values
+        uses: cardinalby/export-env-action@v2
+        with:
+          envFile: '.env.dist'
       - name: Build
         run: docker compose -f docker-compose.dev.yaml up -d
       - name: Lint and Test
diff --git a/.gitignore b/.gitignore
index 61d44585..18b4031b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,7 +42,6 @@ dataset/cache-labels/*
 dataset/df/20*
 .idea/*
 HTMLgenerator/output/*
-venv/
 .pytest_cache/*
 .coverage
 .coverage.*
@@ -51,5 +50,6 @@ coverage.xml
 .DS_Store
 **/.DS_Store
 
+.env
 .env.rc
-.env.dev
\ No newline at end of file
+.env.dev
diff --git a/README.md b/README.md
index 00c826a7..63603dc9 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,12 @@ cd jdi-qasp-ml
 ```
 git checkout branch_name
 ```
-4. build and start containers:
+4. Copy the `.env.dist` file to `.env`:
+```
+cp .env.dist .env
+```
+5. Adjust the variables in `.env` to your needs (refer to the [Settings](#settings) section).
+6. Build and start the containers:
 ```
 docker-compose -f docker-compose.dev.yaml up --build
 ```
@@ -219,6 +224,13 @@ git pull
 ```
 docker-compose -f docker-compose.dev.yaml up
 ```
+
+## Settings
+
+| Variable name                    | Description | Default value |
+|----------------------------------|-------------|---------------|
+| SELENOID_PARALLEL_SESSIONS_COUNT | Total number of parallel Selenoid sessions.<br/>Also used as the number of worker processes that calculate the visibility of page elements.<br/>Set it to the number of parallel threads your processor supports, or to that number minus 2 to reduce CPU load. | 4 |
+
 # Docker - get debugging info:
 - http://localhost:5050/build - get the docker image build's datetime
 - http://localhost:5050/files - get data sent by browser to model
diff --git a/app/models.py b/app/models.py
index 7e7dbd42..24151c3f 100644
--- a/app/models.py
+++ b/app/models.py
@@ -54,7 +54,8 @@ class PredictionRequestElement(BaseModel):
 
 
 class PredictionInputModel(BaseModel):
-    __root__: List[PredictionRequestElement]
+    document: str
+    elements: str
 
 
 class PredictedElement(BaseModel):
diff --git a/app/mongodb.py b/app/mongodb.py
index fc2c4f96..ba47a3db 100644
--- a/app/mongodb.py
+++ b/app/mongodb.py
@@ -6,6 +6,7 @@
 
 from app import MONGO_DB_HOST, MONGO_DB_PORT
 from app.models import LoggingInfoModel
+from app.tasks import ENV
 
 client = pymongo.MongoClient(MONGO_DB_HOST, MONGO_DB_PORT)
 mongo_db = client.jdn
@@ -15,6 +16,10 @@
 
 
 def get_session_id() -> int:
+    # don't use the database when running locally
+    if ENV == "LOCAL":
+        return 0
+
     session_id_entry = mongo_db.sesion_id_collection.find_one()
     if session_id_entry:
         new_result = session_id_entry["session_id"] + 1
@@ -28,8 +33,10 @@ def get_session_id() -> int:
 
 
 def create_logs_json_file() -> None:
-    logs_collection = mongo_db.logs
-    all_logs = logs_collection.find()
+    all_logs = {}
+    if ENV != "LOCAL":
+        logs_collection = mongo_db.logs
+        all_logs = logs_collection.find()
 
     with open("logs.json", "w") as output_file:
         json.dump(json.loads(dumps(all_logs)), output_file)
diff --git a/app/selenium_app.py b/app/selenium_app.py
new file mode 100644
index 00000000..dfcfe73a
--- /dev/null
+++ b/app/selenium_app.py
@@ -0,0 +1,107 @@
+from typing import Iterable, Sized, Tuple, Dict, List
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+import concurrent.futures
+
+from app.logger import logger
+from utils import config
+
+
+def get_webdriver() -> webdriver.Remote:
+    """Returns a remote Chrome webdriver instance"""
+    chrome_options = Options()
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+
+    capabilities = {
+        "browserName": "chrome",
+        "browserVersion": "118.0",
+        "selenoid:options": {
+            "enableVideo": False
+        }
+    }
+
+    return webdriver.Remote(
+        command_executor="http://jdi-qasp-ml-selenoid:4444/wd/hub",
+        desired_capabilities=capabilities,
+        options=chrome_options,
+    )
+
+
+def get_page_elements(driver: webdriver.Remote, page_content: str) -> List[WebElement]:
+    """Returns a list of all elements contained in page_content"""
+    driver.execute_script(
+        "document.body.insertAdjacentHTML('beforeend', arguments[0]);",
+        page_content,
+    )
+    wait = WebDriverWait(driver, 10)
+    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+
+    return driver.find_elements(by=By.XPATH, value="//*")
+
+
+def get_elements_visibility(page_content: str, starting_element_idx: int, ending_element_idx: int) -> Dict[str, bool]:
+    """Returns the visibility of a portion of the elements contained in page_content
+
+    starting_element_idx and ending_element_idx are the starting and ending
+    indexes of a slice of the page_content elements returned by the
+    get_page_elements() function.
+    """
+    driver = get_webdriver()
+    all_elements = get_page_elements(driver, page_content)
+
+    result = {}
+
+    for element in all_elements[starting_element_idx:ending_element_idx]:
+        element_jdn_hash = element.get_attribute("jdn-hash")
+        is_shown = element.is_displayed()
+        result[element_jdn_hash] = is_shown
+        logger.info(f"Element with jdn-hash {element_jdn_hash} is {'visible' if is_shown else 'invisible'}")
+
+    driver.quit()
+
+    return result
+
+
+def get_chunks_boundaries(data: Sized, desired_chunks_amount: int) -> Iterable[Tuple[int, int]]:
+    """Yields (start, end) index pairs that split data into the desired number of chunks"""
+    data_size = len(data)
+    chunk_size = data_size // desired_chunks_amount
+
+    for i in range(desired_chunks_amount):
+        if i < (desired_chunks_amount - 1):
+            yield i * chunk_size, (i + 1) * chunk_size
+        else:
+            yield i * chunk_size, data_size
+
+
+def get_element_id_to_is_displayed_mapping(page_content: str) -> Dict[str, bool]:
+    """Returns the visibility status of all elements on the page
+
+    The returned dictionary uses the elements' jdn-hash attribute values as keys.
+    """
+    escaped_page_content = str(page_content).encode('utf-8').decode('unicode_escape')
+
+    driver = get_webdriver()
+    all_elements = get_page_elements(driver, escaped_page_content)
+    driver.quit()
+
+    num_of_workers = config.SELENOID_PARALLEL_SESSIONS_COUNT
+    jobs_chunks = get_chunks_boundaries(all_elements, num_of_workers)
+    result = {}
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=num_of_workers) as executor:
+        futures = [
+            executor.submit(get_elements_visibility, escaped_page_content, s, e)
+            for s, e in jobs_chunks
+        ]
+        for future in concurrent.futures.as_completed(futures):
+            result.update(future.result())
+
+    return result
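The chunking above hands each Selenoid session a contiguous slice of elements, with the last chunk absorbing the remainder of the integer division. A standalone sanity check of `get_chunks_boundaries()` with hypothetical sizes (10 elements, 4 workers):

```
from typing import Iterable, Sized, Tuple

def get_chunks_boundaries(data: Sized, desired_chunks_amount: int) -> Iterable[Tuple[int, int]]:
    # Copy of the function from app/selenium_app.py above
    data_size = len(data)
    chunk_size = data_size // desired_chunks_amount
    for i in range(desired_chunks_amount):
        if i < (desired_chunks_amount - 1):
            yield i * chunk_size, (i + 1) * chunk_size
        else:
            yield i * chunk_size, data_size

# 10 elements split across 4 workers -> (0, 2) (2, 4) (4, 6) (6, 10)
for start, end in get_chunks_boundaries(list(range(10)), 4):
    print(start, end)
```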
+ """ + driver = get_webdriver() + all_elements = get_page_elements(driver, page_content) + + result = {} + + for element in all_elements[starting_element_idx:ending_element_idx]: + element_jdn_hash = element.get_attribute("jdn-hash") + is_shown = element.is_displayed() + result[element_jdn_hash] = is_shown + logger.info(f"Element with jdn-hash {element_jdn_hash} {'Visible' if is_shown else 'Invisible'}") + + driver.quit() + + return result + + +def get_chunks_boundaries(data: Sized, desired_chunks_amount: int) -> Iterable[Tuple[int, int]]: + """Returns split indexes for a list, enabling easy partitioning into desired chunks""" + data_size = len(data) + chunk_size = data_size // desired_chunks_amount + + for i in range(desired_chunks_amount): + if i < (desired_chunks_amount - 1): + yield i * chunk_size, (i + 1) * chunk_size + else: + yield i * chunk_size, data_size + + +def get_element_id_to_is_displayed_mapping(page_content: str) -> Dict[str, bool]: + """Returns visibility status of all elements in the page + + Returned dictionary uses elements' jdn-hash property value as keys + """ + escaped_page_content = str(page_content).encode('utf-8').decode('unicode_escape') + + driver = get_webdriver() + all_elements = get_page_elements(driver, escaped_page_content) + driver.quit() + + num_of_workers = config.SELENOID_PARALLEL_SESSIONS_COUNT + jobs_chunks = get_chunks_boundaries(all_elements, num_of_workers) + result = {} + + with concurrent.futures.ProcessPoolExecutor(max_workers=num_of_workers) as executor: + futures = [ + executor.submit(get_elements_visibility, escaped_page_content, s, e) + for s, e in jobs_chunks + ] + for future in concurrent.futures.as_completed(futures): + result.update(future.result()) + + return result diff --git a/app/tasks.py b/app/tasks.py index 7e65f8be..ca3e826a 100644 --- a/app/tasks.py +++ b/app/tasks.py @@ -1,4 +1,5 @@ import datetime +import os from lxml import etree, html @@ -8,6 +9,8 @@ from app.redis_app import redis_app from utils.robula import generate_xpath +ENV = os.getenv("ENV") + @celery_app.task(bind=True) def task_schedule_xpath_generation( @@ -21,22 +24,23 @@ def task_schedule_xpath_generation( start_time = datetime.datetime.utcnow() page = redis_app.get(document_uuid).decode("utf-8") result = generate_xpath(element_id, page, document_uuid, config) # calculation itself - end_time = datetime.datetime.utcnow() - task_duration = end_time - start_time - - document = html.fromstring(page) - element = document.xpath(element_id)[0] - tree = etree.ElementTree(document) - full_xpath = tree.getpath(element) - nesting_num = len(full_xpath.split("/")) - 1 - - task_kwargs = self.request.kwargs - task_kwargs["task_duration"] = str( - task_duration - ) # for custom metrics logging to mongodb - task_kwargs["start_time"] = str(start_time) # for custom metrics logging to mongodb - task_kwargs["full_xpath"] = full_xpath # for custom metrics logging to mongodb - task_kwargs["nesting_num"] = nesting_num # for custom metrics logging to mongodb + + # calculation of additional parameters if not locally + if ENV != "LOCAL": + end_time = datetime.datetime.utcnow() + task_duration = end_time - start_time + + document = html.fromstring(page) + element = document.xpath(element_id)[0] + tree = etree.ElementTree(document) + full_xpath = tree.getpath(element) + nesting_num = len(full_xpath.split("/")) - 1 + + task_kwargs = self.request.kwargs + task_kwargs["task_duration"] = str(task_duration) # for custom metrics logging to mongodb + task_kwargs["start_time"] = str(start_time) # 
diff --git a/docker-compose-rc.yaml b/docker-compose-rc.yaml
index 310a6c23..53aba8f2 100644
--- a/docker-compose-rc.yaml
+++ b/docker-compose-rc.yaml
@@ -1,6 +1,33 @@
-version: '3'
+version: '3.9'
+name: jdi-qasp-ml
 
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api:
     image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
     container_name: jdi-qasp-ml-api
@@ -11,11 +38,14 @@ services:
       driver: "json-file"
       options:
         max-size: "256m"
+    environment:
+      ENV: LOCAL
+      SELENOID_PARALLEL_SESSIONS_COUNT: $SELENOID_PARALLEL_SESSIONS_COUNT
 
   celery:
     image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
     container_name: jdi-qasp-ml-celery
-    command: celery -A app.celery_app:celery_app worker -l info
+    command: bash -c "chmod +x start_celery.sh && ./start_celery.sh"
     logging:
       driver: "json-file"
       options:
@@ -31,12 +61,3 @@ services:
       driver: "json-file"
       options:
         max-size: "256m"
-
-  mongodb:
-    image: mongo:5.0
-    ports:
-      - 27017:27017
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "256m"
diff --git a/docker-compose-stable.yaml b/docker-compose-stable.yaml
index 620227eb..0c6820eb 100644
--- a/docker-compose-stable.yaml
+++ b/docker-compose-stable.yaml
@@ -1,10 +1,39 @@
 version: '3'
+name: jdi-qasp-ml
 
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api:
     image: "ghcr.io/jdi-testing/jdi-qasp-ml:stable"
     container_name: jdi-qasp-ml-api
     command: uvicorn app.main:api --host 0.0.0.0 --port 5000
+    environment:
+      - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
     ports:
       - "5050:5000"
     logging:
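In the services above and below, the `${SELENOID_PARALLEL_SESSIONS_COUNT:?...}` interpolation makes `docker compose` fail fast with the given message whenever the variable is unset or empty, instead of silently starting Selenoid without a session limit. A rough Python analogue of that guard, for illustration only:

```
import os

# Illustrative equivalent of compose's "${VAR:?message}" parameter expansion.
value = os.environ.get("SELENOID_PARALLEL_SESSIONS_COUNT")
if not value:
    raise RuntimeError(
        "SELENOID_PARALLEL_SESSIONS_COUNT is unset - copy .env.dist to .env first"
    )
print(f"Selenoid session limit: {value}")
```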
diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml
index 6f2200c3..48c66544 100644
--- a/docker-compose.dev.yaml
+++ b/docker-compose.dev.yaml
@@ -1,6 +1,31 @@
-version: '3'
-
+name: jdi-qasp-ml
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api:
     container_name: jdi-qasp-ml-api
     command: uvicorn app.main:api --host 0.0.0.0 --port 5050
@@ -11,6 +36,8 @@ services:
       - "5050:5050"
     volumes:
       - .:/jdi-qasp-ml
+    environment:
+      - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
     logging:
       driver: "json-file"
       options:
@@ -22,8 +49,6 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    volumes:
-      - .:/jdi-qasp-ml
     logging:
       driver: "json-file"
       options:
diff --git a/docker-compose.remote_server.yaml b/docker-compose.remote_server.yaml
index 64432d00..4e856e45 100644
--- a/docker-compose.remote_server.yaml
+++ b/docker-compose.remote_server.yaml
@@ -1,6 +1,33 @@
 version: '3'
+name: jdi-qasp-ml
 
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api:
     container_name: jdi-qasp-ml-api
     command: uvicorn app.main:api --host 0.0.0.0 --port 80
@@ -13,6 +40,8 @@ services:
       - .:/jdi-qasp-ml
     env_file:
       - .env.rc
+    environment:
+      - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
     logging:
       driver: "json-file"
       options:
@@ -24,8 +53,6 @@ services:
     build:
      context: .
       dockerfile: Dockerfile
-    volumes:
-      - .:/jdi-qasp-ml
     env_file:
       - .env.rc
     logging:
@@ -62,4 +89,4 @@ services:
     logging:
       driver: "json-file"
       options:
-        max-size: "256m"
\ No newline at end of file
+        max-size: "256m"
diff --git a/docker-compose.remote_server_dev.yaml b/docker-compose.remote_server_dev.yaml
index db05ebf5..aeb9c74e 100644
--- a/docker-compose.remote_server_dev.yaml
+++ b/docker-compose.remote_server_dev.yaml
@@ -1,6 +1,33 @@
 version: '3'
+name: jdi-qasp-ml
 
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api-dev:
     container_name: jdi-qasp-ml-api-dev
     command: uvicorn app.main:api --host 0.0.0.0 --port 80
@@ -13,6 +40,8 @@ services:
       - .:/jdi-qasp-ml
     env_file:
       - .env.dev
+    environment:
+      - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
     logging:
       driver: "json-file"
       options:
@@ -24,8 +53,6 @@ services:
     build:
       context: .
       dockerfile: Dockerfile
-    volumes:
-      - .:/jdi-qasp-ml
     env_file:
       - .env.dev
     logging:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 0b31d85d..0433b23e 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,6 +1,33 @@
-version: '3'
+version: '3.9'
+name: jdi-qasp-ml
 
 services:
+  chrome:
+    image: selenoid/vnc_chrome:118.0
+    attach: false
+
+  selenoid:
+    container_name: "jdi-qasp-ml-selenoid"
+    image: "aerokube/selenoid:latest"
+    ports:
+      - "4445:4444"
+    volumes:
+      - ".:/etc/selenoid"
+      - "./target:/output"
+      - "/var/run/docker.sock:/var/run/docker.sock"
+      - "./target:/opt/selenoid/video"
+    environment:
+      - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+    command:
+      - "-conf"
+      - "/etc/selenoid/browsers.json"
+      - "-video-output-dir"
+      - "/opt/selenoid/video"
+      - "-container-network"
+      - "jdi-qasp-ml_default"
+      - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor (optionally minus 2 to reduce CPU load)}"
+
   api:
     image: "ghcr.io/jdi-testing/jdi-qasp-ml:latest"
     container_name: jdi-qasp-ml-api
@@ -11,11 +38,14 @@ services:
       driver: "json-file"
       options:
         max-size: "256m"
+    environment:
+      ENV: LOCAL
+      SELENOID_PARALLEL_SESSIONS_COUNT: $SELENOID_PARALLEL_SESSIONS_COUNT
 
   celery:
     image: "ghcr.io/jdi-testing/jdi-qasp-ml:latest"
     container_name: jdi-qasp-ml-celery
-    command: celery -A app.celery_app:celery_app worker -l info
+    command: bash -c "chmod +x start_celery.sh && ./start_celery.sh"
     logging:
       driver: "json-file"
      options:
@@ -31,14 +61,3 @@ services:
       driver: "json-file"
       options:
         max-size: "256m"
-
-  mongodb:
-    image: mongo:5.0
-    ports:
-      - 27017:27017
-    volumes:
-      - ~/apps/mongo:/data/db
-    logging:
-      driver: "json-file"
-      options:
-        max-size: "256m"
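The `ds_methods` changes below all unpack the reshaped request body introduced in `PredictionInputModel`: a `document` string holding the page HTML and an `elements` string holding a JSON-encoded array of element records. A hypothetical minimal payload, with the element record fields invented for illustration:

```
import json

# Hypothetical request body matching the new PredictionInputModel shape.
body = json.dumps({
    "document": "<html><body><button jdn-hash='abc123'>OK</button></body></html>",
    "elements": json.dumps([
        {"element_id": "abc123", "tag_name": "button",
         "onmouseover": None, "onmouseenter": None},
    ]),
}).encode("utf-8")

# The predict functions below unpack it like this:
body_json = json.loads(body.decode("utf-8"))
elements_json = body_json.get("elements", [])   # JSON string of element records
document_json = body_json.get("document", "")   # page HTML for visibility checks
```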
diff --git a/ds_methods/angular_predict.py b/ds_methods/angular_predict.py
index c92225cc..1852604f 100644
--- a/ds_methods/angular_predict.py
+++ b/ds_methods/angular_predict.py
@@ -11,6 +11,7 @@
 from tqdm.auto import trange
 
 from app import angular_df_path_full, angular_model_full
+from app.selenium_app import get_element_id_to_is_displayed_mapping
 from utils.dataset import MUI_JDNDataset
 
 logger = logging.getLogger("jdi-qasp-ml")
@@ -18,17 +19,25 @@
 
 @alru_cache(maxsize=32)
 async def angular_predict_elements(body):
+    body_str = body.decode("utf-8")
+    body_json = json.loads(body_str)
+    elements_json = body_json.get("elements", [])
+    document_json = body_json.get("document", "")
+
     # create softmax layer function to get probabilities from logits
     softmax = torch.nn.Softmax(dim=1)
+
     # generate temporary filename
     filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json")
     with open(os.path.join(angular_df_path_full, filename), "wb") as fp:
         logger.info(f"saving {filename}")
         fp.write(body)
         fp.flush()
+
     filename = filename.replace(".json", ".pkl")
     logger.info(f"saving {filename}")
-    df = pd.DataFrame(json.loads(body))
+    df = pd.DataFrame(json.loads(elements_json))
+
     # fix bad data which can come in 'onmouseover', 'onmouseenter'
     df.onmouseover = df.onmouseover.apply(
         lambda x: "true" if x is not None else None
@@ -36,13 +45,16 @@
     df.onmouseenter = df.onmouseenter.apply(
         lambda x: "true" if x is not None else None
     )
+
     df.to_pickle(f"{angular_df_path_full}/{filename}")
+
     logger.info("Creating JDNDataset")
     dataset = MUI_JDNDataset(
         datasets_list=[filename.split(".")[0]],
         rebalance_and_shuffle=False,
         dataset_type="angular",
     )
+
     dataloader = DataLoader(dataset, shuffle=False, batch_size=1)
     device = "cpu"
     logger.info(f"Load model with hardcoded device: {device}")
@@ -50,6 +62,7 @@
         f"{angular_model_full}/model.pth", map_location="cpu"
     ).to(device=device)
     model.eval()
+
     logger.info("Predicting...")
     results = []
     with trange(len(dataloader)) as bar:
@@ -78,6 +91,7 @@
             )
             bar.update(1)
     results_df = pd.DataFrame(results)
+
     # # update the dataset with predictions
     dataset.df["predicted_label"] = results_df.y_pred_label.values
     dataset.df["predicted_probability"] = results_df.y_probability.values
@@ -96,4 +110,8 @@
     else:
         del model
         gc.collect()
-    return results_df[columns_to_publish].to_dict(orient="records")
+    result = results_df[columns_to_publish].to_dict(orient="records")
+    element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json)
+    for element in result:
+        element["is_shown"] = element_id_to_is_displayed_map.get(element["element_id"], None)
+    return result
diff --git a/ds_methods/html5_predict.py b/ds_methods/html5_predict.py
index 4cff473d..454df308 100644
--- a/ds_methods/html5_predict.py
+++ b/ds_methods/html5_predict.py
@@ -8,12 +8,9 @@
 import pandas as pd
 from async_lru import alru_cache
 
-from app import (
-    UPLOAD_DIRECTORY,
-    html5_classes_path,
-    html5_df_path,
-    html5_model,
-)
+from app import (UPLOAD_DIRECTORY, html5_classes_path, html5_df_path,
+                 html5_model)
+from app.selenium_app import get_element_id_to_is_displayed_mapping
 from utils.dataset import HTML5_JDNDataset
 
 logger = logging.getLogger("jdi-qasp-ml")
@@ -21,6 +18,11 @@
 
 @alru_cache(maxsize=32)
 async def html5_predict_elements(body):
+    body_str = body.decode("utf-8")
+    body_json = json.loads(body_str)
+    elements_json = body_json.get("elements", [])
+    document_json = body_json.get("document", "")
+
     # generate temporary filename
     filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json")
     with open(os.path.join(UPLOAD_DIRECTORY, filename), "wb") as fp:
@@ -30,7 +32,7 @@
     filename = filename.replace(".json", ".pkl")
logger.info(f"saving {filename}") - df = pd.DataFrame(json.loads(body)) + df = pd.DataFrame(json.loads(elements_json)) # fix bad data which can come in 'onmouseover', 'onmouseenter' df.onmouseover = df.onmouseover.apply( @@ -48,6 +50,7 @@ async def html5_predict_elements(body): dataset_type="html5", rebalance_and_shuffle=False, ) + # load model logger.info("Loading the model") pkl_filename = "DT_model.pkl" @@ -89,4 +92,10 @@ async def html5_predict_elements(body): else: del model gc.collect() - return results_df[columns_to_publish].to_dict(orient="records") + result = results_df[columns_to_publish].to_dict(orient="records") + + logger.info("Determining visibility locators") + element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json) + for element in result: + element["is_shown"] = element_id_to_is_displayed_map.get(element["element_id"], None) + return result diff --git a/ds_methods/mui_predict.py b/ds_methods/mui_predict.py index 74b5cb09..640e31c5 100644 --- a/ds_methods/mui_predict.py +++ b/ds_methods/mui_predict.py @@ -11,6 +11,7 @@ from tqdm.auto import trange from app import UPLOAD_DIRECTORY, mui_df_path, mui_model +from app.selenium_app import get_element_id_to_is_displayed_mapping from utils.dataset import MUI_JDNDataset logger = logging.getLogger("jdi-qasp-ml") @@ -18,17 +19,25 @@ @alru_cache(maxsize=32) async def mui_predict_elements(body): + body_str = body.decode("utf-8") + body_json = json.loads(body_str) + elements_json = body_json.get("elements", []) + document_json = body_json.get("document", "") + # create softmax layser function to get probabilities from logits softmax = torch.nn.Softmax(dim=1) + # generate temporary filename filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json") with open(os.path.join(UPLOAD_DIRECTORY, filename), "wb") as fp: logger.info(f"saving {filename}") fp.write(body) fp.flush() + filename = filename.replace(".json", ".pkl") logger.info(f"saving {filename}") - df = pd.DataFrame(json.loads(body)) + df = pd.DataFrame(json.loads(elements_json)) + # fix bad data which can come in 'onmouseover', 'onmouseenter' df.onmouseover = df.onmouseover.apply( lambda x: "true" if x is not None else None @@ -36,11 +45,14 @@ async def mui_predict_elements(body): df.onmouseenter = df.onmouseenter.apply( lambda x: "true" if x is not None else None ) + df.to_pickle(f"{mui_df_path}/{filename}") + logger.info("Creating JDNDataset") dataset = MUI_JDNDataset( datasets_list=[filename.split(".")[0]], rebalance_and_shuffle=False ) + dataloader = DataLoader(dataset, shuffle=False, batch_size=1) device = "cpu" logger.info(f"Load model with hardcoded device: {device}") @@ -48,6 +60,7 @@ async def mui_predict_elements(body): device=device ) model.eval() + logger.info("Predicting...") results = [] with trange(len(dataloader)) as bar: @@ -80,7 +93,8 @@ async def mui_predict_elements(body): ) bar.update(1) results_df = pd.DataFrame(results) - # # update the dataset with predictions + + # update the dataset with predictions dataset.df["predicted_label"] = results_df.y_pred_label.values dataset.df["predicted_probability"] = results_df.y_probability.values columns_to_publish = [ @@ -98,4 +112,8 @@ async def mui_predict_elements(body): else: del model gc.collect() - return results_df[columns_to_publish].to_dict(orient="records") + result = results_df[columns_to_publish].to_dict(orient="records") + element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json) + for element in result: + element["is_shown"] = 
diff --git a/model/version/0.2.40 b/model/version/0.2.40
deleted file mode 100644
index e69de29b..00000000
diff --git a/model/version/0.2.62 b/model/version/0.2.62
new file mode 100644
index 00000000..d3f5a12f
--- /dev/null
+++ b/model/version/0.2.62
@@ -0,0 +1 @@
+
diff --git a/start_celery.sh b/start_celery.sh
new file mode 100644
index 00000000..8d17508f
--- /dev/null
+++ b/start_celery.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+CORES_TO_USE=$(($(nproc)-2>0?$(nproc)-2:1))
+echo $CORES_TO_USE
+celery -A app.celery_app:celery_app worker -l info --concurrency $CORES_TO_USE
diff --git a/utils/api_utils.py b/utils/api_utils.py
index 214bd95c..afd017d5 100644
--- a/utils/api_utils.py
+++ b/utils/api_utils.py
@@ -13,7 +13,7 @@
 from app.logger import logger
 from app.models import LoggingInfoModel, TaskStatusModel, XPathGenerationModel
 from app.redis_app import redis_app
-from app.tasks import task_schedule_xpath_generation
+from app.tasks import ENV, task_schedule_xpath_generation
 
 
 def get_task_status(task_id) -> TaskStatusModel:
@@ -87,21 +87,22 @@
         # deleting underscores in task_id if any to send to frontend
         result["id"] = result["id"].strip("_")
 
-        session_id = task_result_obj.kwargs.get("session_id")
-        website_url = task_result_obj.kwargs.get("website_url")
-        start_time = task_result_obj.kwargs.get("start_time")
-        task_duration = task_result_obj.kwargs.get("task_duration")
-        full_xpath = task_result_obj.kwargs.get("full_xpath")
-        nesting_num = task_result_obj.kwargs.get("nesting_num")
-        await mongodb.enrich_logs_with_generated_locators(
-            session_id,
-            website_url,
-            full_xpath,
-            nesting_num,
-            result,
-            start_time,
-            task_duration,
-        )
+        if ENV != "LOCAL":
+            session_id = task_result_obj.kwargs.get("session_id")
+            website_url = task_result_obj.kwargs.get("website_url")
+            start_time = task_result_obj.kwargs.get("start_time")
+            task_duration = task_result_obj.kwargs.get("task_duration")
+            full_xpath = task_result_obj.kwargs.get("full_xpath")
+            nesting_num = task_result_obj.kwargs.get("nesting_num")
+            await mongodb.enrich_logs_with_generated_locators(
+                session_id,
+                website_url,
+                full_xpath,
+                nesting_num,
+                result,
+                start_time,
+                task_duration,
+            )
         try:
             await ws.send_json(
                 get_websocket_response(
@@ -251,9 +252,10 @@
 
     elif action == "schedule_multiple_xpath_generations":
         logging_info = LoggingInfoModel(**logging_info)
-        mongodb.create_initial_log_entry(
-            logging_info
-        )  # for custom metrics logging purposes
+        if ENV != "LOCAL":
+            mongodb.create_initial_log_entry(
+                logging_info
+            )  # for custom metrics logging purposes
 
         payload = XPathGenerationModel(**payload)
         element_ids = payload.id
diff --git a/utils/config.py b/utils/config.py
index a6724c74..c18f4b90 100644
--- a/utils/config.py
+++ b/utils/config.py
@@ -37,4 +37,8 @@
 SMTP_HOST = os.getenv("SMTP_HOST", "smtp.yandex.ru")
 RECIPIENT_EMAILS = os.getenv("RECIPIENT_EMAILS", "SupportJDI@epam.com")
 
+SELENOID_PARALLEL_SESSIONS_COUNT = int(
+    os.getenv("SELENOID_PARALLEL_SESSIONS_COUNT", len(os.sched_getaffinity(0)))
+)
+
 logger.info("Module utils.config was loaded")
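One caveat worth noting: `os.sched_getaffinity()` exists only on Linux, so the default above would raise `AttributeError` on macOS or Windows. A more portable fallback (a sketch under that assumption, not part of this change) could mirror the "nproc minus 2, at least 1" rule from start_celery.sh:

```
import os

def default_session_count() -> int:
    # os.sched_getaffinity() is Linux-only; fall back to os.cpu_count() elsewhere,
    # and keep two cores free as start_celery.sh does, but never go below one.
    try:
        cores = len(os.sched_getaffinity(0))
    except AttributeError:
        cores = os.cpu_count() or 1
    return max(cores - 2, 1)

SELENOID_PARALLEL_SESSIONS_COUNT = int(
    os.getenv("SELENOID_PARALLEL_SESSIONS_COUNT", default_session_count())
)
```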