diff --git a/.env.dist b/.env.dist
new file mode 100644
index 00000000..d090e06f
--- /dev/null
+++ b/.env.dist
@@ -0,0 +1 @@
+SELENOID_PARALLEL_SESSIONS_COUNT=4
diff --git a/.github/workflows/.release-workflow-rc.yml b/.github/workflows/.release-workflow-rc.yml
index 65b38afe..0fceeb4c 100644
--- a/.github/workflows/.release-workflow-rc.yml
+++ b/.github/workflows/.release-workflow-rc.yml
@@ -24,6 +24,10 @@ jobs:
uses: actions/checkout@v3
with:
submodules: true
+      - name: Load default environment variable values
+ uses: cardinalby/export-env-action@v2
+ with:
+ envFile: '.env.dist'
- name: Build
run: docker compose -f docker-compose.dev.yaml up -d
- name: Lint and Test
diff --git a/.github/workflows/.release-workflow.yml b/.github/workflows/.release-workflow.yml
index 7ac93725..bcd6124a 100644
--- a/.github/workflows/.release-workflow.yml
+++ b/.github/workflows/.release-workflow.yml
@@ -40,6 +40,10 @@ jobs:
with:
commit_message: increment version
branch: develop
+      - name: Load default environment variable values
+ uses: cardinalby/export-env-action@v2
+ with:
+ envFile: '.env.dist'
- name: Build
run: docker compose -f docker-compose.dev.yaml up -d
- name: Lint and Test
diff --git a/.gitignore b/.gitignore
index 61d44585..18b4031b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -42,7 +42,6 @@ dataset/cache-labels/*
dataset/df/20*
.idea/*
HTMLgenerator/output/*
-venv/
.pytest_cache/*
.coverage
.coverage.*
@@ -51,5 +50,6 @@ coverage.xml
.DS_Store
**/.DS_Store
+.env
.env.rc
-.env.dev
\ No newline at end of file
+.env.dev
diff --git a/README.md b/README.md
index 00c826a7..63603dc9 100644
--- a/README.md
+++ b/README.md
@@ -202,7 +202,12 @@ cd jdi-qasp-ml
```
git checkout branch_name
```
-4. build and start containers:
+4. Copy the `.env.dist` file to `.env`:
+```
+cp .env.dist .env
+```
+5. Adjust the variables in the `.env` file to your needs (see the [Settings](#settings) section).
+6. Build and start containers:
```
docker-compose -f docker-compose.dev.yaml up --build
```
@@ -219,6 +224,13 @@ git pull
```
docker-compose -f docker-compose.dev.yaml up
```
+
+## Settings
+
+| Variable name | Description | Default value |
+|----------------------------------|-------------|---------------|
+| SELENOID_PARALLEL_SESSIONS_COUNT | Total number of parallel Selenoid sessions. Also used to determine the number of processes that calculate the visibility of page elements. Set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load. | 4 |
+
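+For example, on a processor with 8 hardware threads you might set the value to 6 (8 minus 2, leaving some CPU headroom); this is a suggested starting point, not a strict requirement:
+```
+SELENOID_PARALLEL_SESSIONS_COUNT=6
+```
+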
# Docker - get debugging info:
- http://localhost:5050/build - get the docker image build's datetime
- http://localhost:5050/files - get data sent by browser to model
diff --git a/app/models.py b/app/models.py
index 7e7dbd42..24151c3f 100644
--- a/app/models.py
+++ b/app/models.py
@@ -54,7 +54,8 @@ class PredictionRequestElement(BaseModel):
class PredictionInputModel(BaseModel):
- __root__: List[PredictionRequestElement]
+ document: str
+ elements: str
class PredictedElement(BaseModel):
diff --git a/app/mongodb.py b/app/mongodb.py
index fc2c4f96..ba47a3db 100644
--- a/app/mongodb.py
+++ b/app/mongodb.py
@@ -6,6 +6,7 @@
from app import MONGO_DB_HOST, MONGO_DB_PORT
from app.models import LoggingInfoModel
+from app.tasks import ENV
client = pymongo.MongoClient(MONGO_DB_HOST, MONGO_DB_PORT)
mongo_db = client.jdn
@@ -15,6 +16,10 @@
def get_session_id() -> int:
+    # skip the database when running locally
+ if ENV == "LOCAL":
+ return 0
+
session_id_entry = mongo_db.sesion_id_collection.find_one()
if session_id_entry:
new_result = session_id_entry["session_id"] + 1
@@ -28,8 +33,10 @@ def get_session_id() -> int:
def create_logs_json_file() -> None:
- logs_collection = mongo_db.logs
- all_logs = logs_collection.find()
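+    # no MongoDB when running locally, so fall back to an empty log set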
+ all_logs = {}
+ if ENV != "LOCAL":
+ logs_collection = mongo_db.logs
+ all_logs = logs_collection.find()
with open("logs.json", "w") as output_file:
json.dump(json.loads(dumps(all_logs)), output_file)
diff --git a/app/selenium_app.py b/app/selenium_app.py
new file mode 100644
index 00000000..dfcfe73a
--- /dev/null
+++ b/app/selenium_app.py
@@ -0,0 +1,107 @@
+from typing import Iterable, Sized, Tuple, Dict, List
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.remote.webelement import WebElement
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+import concurrent.futures
+
+from app.logger import logger
+from utils import config
+
+
+def get_webdriver() -> webdriver.Remote:
+ """Returns a remote Chrome webdriver instance"""
+ chrome_options = Options()
+ chrome_options.add_argument("--no-sandbox")
+ chrome_options.add_argument("--headless")
+ chrome_options.add_argument("--disable-dev-shm-usage")
+
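+    # Selenoid-specific options; video recording is disabled to keep sessions light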
+ capabilities = {
+ "browserName": "chrome",
+ "browserVersion": "118.0",
+ "selenoid:options": {
+ "enableVideo": False
+ }
+ }
+
+ return webdriver.Remote(
+ command_executor="http://jdi-qasp-ml-selenoid:4444/wd/hub",
+ desired_capabilities=capabilities,
+ options=chrome_options,
+ )
+
+
+def get_page_elements(driver: webdriver.Remote, page_content: str) -> List[WebElement]:
+ """Returns a list of all elements contained in page_content"""
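+    # inject the captured page markup into the otherwise blank browser page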
+ driver.execute_script(
+ "document.body.insertAdjacentHTML('beforeend', arguments[0]);",
+ page_content,
+ )
+ wait = WebDriverWait(driver, 10)
+ wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
+
+ return driver.find_elements(by=By.XPATH, value="//*")
+
+
+def get_elements_visibility(page_content: str, starting_element_idx: int, ending_element_idx: int) -> Dict[str, bool]:
+    """Returns the visibility of a slice of the elements contained in page_content
+
+    starting_element_idx and ending_element_idx are the start and end indexes
+    of the slice of elements returned by get_page_elements() for page_content.
+ """
+ driver = get_webdriver()
+ all_elements = get_page_elements(driver, page_content)
+
+ result = {}
+
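+    # each worker renders the page in its own session, since is_displayed() needs a live driver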
+ for element in all_elements[starting_element_idx:ending_element_idx]:
+ element_jdn_hash = element.get_attribute("jdn-hash")
+ is_shown = element.is_displayed()
+ result[element_jdn_hash] = is_shown
+ logger.info(f"Element with jdn-hash {element_jdn_hash} {'Visible' if is_shown else 'Invisible'}")
+
+ driver.quit()
+
+ return result
+
+
+def get_chunks_boundaries(data: Sized, desired_chunks_amount: int) -> Iterable[Tuple[int, int]]:
+    """Yields (start, end) index pairs that partition data into desired_chunks_amount chunks
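+
+    The last chunk absorbs any remainder: splitting a 10-element collection
+    into 3 chunks, for example, yields (0, 3), (3, 6), (6, 10).
+    """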
+ data_size = len(data)
+ chunk_size = data_size // desired_chunks_amount
+
+ for i in range(desired_chunks_amount):
+ if i < (desired_chunks_amount - 1):
+ yield i * chunk_size, (i + 1) * chunk_size
+ else:
+ yield i * chunk_size, data_size
+
+
+def get_element_id_to_is_displayed_mapping(page_content: str) -> Dict[str, bool]:
+    """Returns the visibility status of every element on the page
+
+    The returned dictionary is keyed by each element's jdn-hash attribute value.
+ """
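+    # unescape the page content (it may arrive with escaped sequences) before injecting it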
+ escaped_page_content = str(page_content).encode('utf-8').decode('unicode_escape')
+
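+    # first pass: render the page once to learn how many elements need checking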
+ driver = get_webdriver()
+ all_elements = get_page_elements(driver, escaped_page_content)
+ driver.quit()
+
+ num_of_workers = config.SELENOID_PARALLEL_SESSIONS_COUNT
+ jobs_chunks = get_chunks_boundaries(all_elements, num_of_workers)
+ result = {}
+
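+    # second pass: check visibility in parallel, one Selenoid session per worker process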
+ with concurrent.futures.ProcessPoolExecutor(max_workers=num_of_workers) as executor:
+ futures = [
+ executor.submit(get_elements_visibility, escaped_page_content, s, e)
+ for s, e in jobs_chunks
+ ]
+ for future in concurrent.futures.as_completed(futures):
+ result.update(future.result())
+
+ return result
diff --git a/app/tasks.py b/app/tasks.py
index 7e65f8be..ca3e826a 100644
--- a/app/tasks.py
+++ b/app/tasks.py
@@ -1,4 +1,5 @@
import datetime
+import os
from lxml import etree, html
@@ -8,6 +9,8 @@
from app.redis_app import redis_app
from utils.robula import generate_xpath
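+# ENV set to "LOCAL" disables MongoDB-backed metrics logging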
+ENV = os.getenv("ENV")
+
@celery_app.task(bind=True)
def task_schedule_xpath_generation(
@@ -21,22 +24,23 @@ def task_schedule_xpath_generation(
start_time = datetime.datetime.utcnow()
page = redis_app.get(document_uuid).decode("utf-8")
result = generate_xpath(element_id, page, document_uuid, config) # calculation itself
- end_time = datetime.datetime.utcnow()
- task_duration = end_time - start_time
-
- document = html.fromstring(page)
- element = document.xpath(element_id)[0]
- tree = etree.ElementTree(document)
- full_xpath = tree.getpath(element)
- nesting_num = len(full_xpath.split("/")) - 1
-
- task_kwargs = self.request.kwargs
- task_kwargs["task_duration"] = str(
- task_duration
- ) # for custom metrics logging to mongodb
- task_kwargs["start_time"] = str(start_time) # for custom metrics logging to mongodb
- task_kwargs["full_xpath"] = full_xpath # for custom metrics logging to mongodb
- task_kwargs["nesting_num"] = nesting_num # for custom metrics logging to mongodb
+
+    # compute the additional metrics parameters only when not running locally
+ if ENV != "LOCAL":
+ end_time = datetime.datetime.utcnow()
+ task_duration = end_time - start_time
+
+ document = html.fromstring(page)
+ element = document.xpath(element_id)[0]
+ tree = etree.ElementTree(document)
+ full_xpath = tree.getpath(element)
+ nesting_num = len(full_xpath.split("/")) - 1
+
+ task_kwargs = self.request.kwargs
+ task_kwargs["task_duration"] = str(task_duration) # for custom metrics logging to mongodb
+ task_kwargs["start_time"] = str(start_time) # for custom metrics logging to mongodb
+ task_kwargs["full_xpath"] = full_xpath # for custom metrics logging to mongodb
+ task_kwargs["nesting_num"] = nesting_num # for custom metrics logging to mongodb
return result
diff --git a/browsers.json b/browsers.json
new file mode 100644
index 00000000..fd07c037
--- /dev/null
+++ b/browsers.json
@@ -0,0 +1,17 @@
+{
+ "chrome": {
+ "default": "91.0",
+ "versions": {
+ "91.0": {
+ "image": "selenoid/vnc_chrome:91.0",
+ "port": "4444"
+      },
+      "114.0": {
+        "image": "selenoid/vnc_chrome:114.0",
+        "port": "4444"
+      },
+      "118.0": {
+        "image": "selenoid/vnc_chrome:118.0",
+        "port": "4444"
+      }
+ }
+ }
+}
\ No newline at end of file
diff --git a/docker-compose-rc.yaml b/docker-compose-rc.yaml
index 310a6c23..53aba8f2 100644
--- a/docker-compose-rc.yaml
+++ b/docker-compose-rc.yaml
@@ -1,6 +1,33 @@
-version: '3'
+version: '3.9'
+name: jdi-qasp-ml
services:
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
container_name: jdi-qasp-ml-api
@@ -11,11 +38,14 @@ services:
driver: "json-file"
options:
max-size: "256m"
+ environment:
+ ENV: LOCAL
+ SELENOID_PARALLEL_SESSIONS_COUNT: $SELENOID_PARALLEL_SESSIONS_COUNT
celery:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:rc"
container_name: jdi-qasp-ml-celery
- command: celery -A app.celery_app:celery_app worker -l info
+ command: bash -c "chmod +x start_celery.sh && ./start_celery.sh"
logging:
driver: "json-file"
options:
@@ -31,12 +61,3 @@ services:
driver: "json-file"
options:
max-size: "256m"
-
- mongodb:
- image: mongo:5.0
- ports:
- - 27017:27017
- logging:
- driver: "json-file"
- options:
- max-size: "256m"
diff --git a/docker-compose-stable.yaml b/docker-compose-stable.yaml
index 620227eb..0c6820eb 100644
--- a/docker-compose-stable.yaml
+++ b/docker-compose-stable.yaml
@@ -1,10 +1,39 @@
version: '3'
+name: jdi-qasp-ml
services:
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:stable"
container_name: jdi-qasp-ml-api
command: uvicorn app.main:api --host 0.0.0.0 --port 5000
+ environment:
+ - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
ports:
- "5050:5000"
logging:
diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml
index 6f2200c3..48c66544 100644
--- a/docker-compose.dev.yaml
+++ b/docker-compose.dev.yaml
@@ -1,6 +1,31 @@
-version: '3'
-
+name: jdi-qasp-ml
services:
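+  # makes Compose pull the browser image that Selenoid launches on demand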
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
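+  # Selenoid starts browser containers via the mounted Docker socket; the
+  # -limit flag caps concurrent sessions at SELENOID_PARALLEL_SESSIONS_COUNT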
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api:
container_name: jdi-qasp-ml-api
command: uvicorn app.main:api --host 0.0.0.0 --port 5050
@@ -11,6 +36,8 @@ services:
- "5050:5050"
volumes:
- .:/jdi-qasp-ml
+ environment:
+ - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
logging:
driver: "json-file"
options:
@@ -22,8 +49,6 @@ services:
build:
context: .
dockerfile: Dockerfile
- volumes:
- - .:/jdi-qasp-ml
logging:
driver: "json-file"
options:
diff --git a/docker-compose.remote_server.yaml b/docker-compose.remote_server.yaml
index 64432d00..4e856e45 100644
--- a/docker-compose.remote_server.yaml
+++ b/docker-compose.remote_server.yaml
@@ -1,6 +1,33 @@
version: '3'
+name: jdi-qasp-ml
services:
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api:
container_name: jdi-qasp-ml-api
command: uvicorn app.main:api --host 0.0.0.0 --port 80
@@ -13,6 +40,8 @@ services:
- .:/jdi-qasp-ml
env_file:
- .env.rc
+ environment:
+ - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
logging:
driver: "json-file"
options:
@@ -24,8 +53,6 @@ services:
build:
context: .
dockerfile: Dockerfile
- volumes:
- - .:/jdi-qasp-ml
env_file:
- .env.rc
logging:
@@ -62,4 +89,4 @@ services:
logging:
driver: "json-file"
options:
- max-size: "256m"
\ No newline at end of file
+ max-size: "256m"
diff --git a/docker-compose.remote_server_dev.yaml b/docker-compose.remote_server_dev.yaml
index db05ebf5..aeb9c74e 100644
--- a/docker-compose.remote_server_dev.yaml
+++ b/docker-compose.remote_server_dev.yaml
@@ -1,6 +1,33 @@
version: '3'
+name: jdi-qasp-ml
services:
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api-dev:
container_name: jdi-qasp-ml-api-dev
command: uvicorn app.main:api --host 0.0.0.0 --port 80
@@ -13,6 +40,8 @@ services:
- .:/jdi-qasp-ml
env_file:
- .env.dev
+ environment:
+ - SELENOID_PARALLEL_SESSIONS_COUNT=$SELENOID_PARALLEL_SESSIONS_COUNT
logging:
driver: "json-file"
options:
@@ -24,8 +53,6 @@ services:
build:
context: .
dockerfile: Dockerfile
- volumes:
- - .:/jdi-qasp-ml
env_file:
- .env.dev
logging:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 0b31d85d..0433b23e 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -1,6 +1,33 @@
-version: '3'
+version: '3.9'
+name: jdi-qasp-ml
services:
+ chrome:
+ image: selenoid/vnc_chrome:118.0
+ attach: false
+
+ selenoid:
+ container_name: "jdi-qasp-ml-selenoid"
+ image: "aerokube/selenoid:latest"
+ ports:
+ - "4445:4444"
+ volumes:
+ - ".:/etc/selenoid"
+ - "./target:/output"
+ - "/var/run/docker.sock:/var/run/docker.sock"
+ - "./target:/opt/selenoid/video"
+ environment:
+ - "OVERRIDE_VIDEO_OUTPUT_DIR=$PWD/target"
+ command:
+ - "-conf"
+ - "/etc/selenoid/browsers.json"
+ - "-video-output-dir"
+ - "/opt/selenoid/video"
+ - "-container-network"
+ - "jdi-qasp-ml_default"
+ - "-limit"
+      - "${SELENOID_PARALLEL_SESSIONS_COUNT:?set it to the number of parallel threads supported by your processor, optionally minus 2 to reduce CPU load}"
+
api:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:latest"
container_name: jdi-qasp-ml-api
@@ -11,11 +38,14 @@ services:
driver: "json-file"
options:
max-size: "256m"
+ environment:
+ ENV: LOCAL
+ SELENOID_PARALLEL_SESSIONS_COUNT: $SELENOID_PARALLEL_SESSIONS_COUNT
celery:
image: "ghcr.io/jdi-testing/jdi-qasp-ml:latest"
container_name: jdi-qasp-ml-celery
- command: celery -A app.celery_app:celery_app worker -l info
+ command: bash -c "chmod +x start_celery.sh && ./start_celery.sh"
logging:
driver: "json-file"
options:
@@ -31,14 +61,3 @@ services:
driver: "json-file"
options:
max-size: "256m"
-
- mongodb:
- image: mongo:5.0
- ports:
- - 27017:27017
- volumes:
- - ~/apps/mongo:/data/db
- logging:
- driver: "json-file"
- options:
- max-size: "256m"
diff --git a/ds_methods/angular_predict.py b/ds_methods/angular_predict.py
index c92225cc..1852604f 100644
--- a/ds_methods/angular_predict.py
+++ b/ds_methods/angular_predict.py
@@ -11,6 +11,7 @@
from tqdm.auto import trange
from app import angular_df_path_full, angular_model_full
+from app.selenium_app import get_element_id_to_is_displayed_mapping
from utils.dataset import MUI_JDNDataset
logger = logging.getLogger("jdi-qasp-ml")
@@ -18,17 +19,25 @@
@alru_cache(maxsize=32)
async def angular_predict_elements(body):
+ body_str = body.decode("utf-8")
+ body_json = json.loads(body_str)
+    elements_json = body_json.get("elements", "[]")  # a JSON string, per PredictionInputModel
+ document_json = body_json.get("document", "")
+
    # create softmax layer function to get probabilities from logits
softmax = torch.nn.Softmax(dim=1)
+
# generate temporary filename
filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json")
with open(os.path.join(angular_df_path_full, filename), "wb") as fp:
logger.info(f"saving {filename}")
fp.write(body)
fp.flush()
+
filename = filename.replace(".json", ".pkl")
logger.info(f"saving {filename}")
- df = pd.DataFrame(json.loads(body))
+ df = pd.DataFrame(json.loads(elements_json))
+
# fix bad data which can come in 'onmouseover', 'onmouseenter'
df.onmouseover = df.onmouseover.apply(
lambda x: "true" if x is not None else None
@@ -36,13 +45,16 @@ async def angular_predict_elements(body):
df.onmouseenter = df.onmouseenter.apply(
lambda x: "true" if x is not None else None
)
+
df.to_pickle(f"{angular_df_path_full}/{filename}")
+
logger.info("Creating JDNDataset")
dataset = MUI_JDNDataset(
datasets_list=[filename.split(".")[0]],
rebalance_and_shuffle=False,
dataset_type="angular",
)
+
dataloader = DataLoader(dataset, shuffle=False, batch_size=1)
device = "cpu"
logger.info(f"Load model with hardcoded device: {device}")
@@ -50,6 +62,7 @@ async def angular_predict_elements(body):
f"{angular_model_full}/model.pth", map_location="cpu"
).to(device=device)
model.eval()
+
logger.info("Predicting...")
results = []
with trange(len(dataloader)) as bar:
@@ -78,6 +91,7 @@ async def angular_predict_elements(body):
)
bar.update(1)
results_df = pd.DataFrame(results)
+
# # update the dataset with predictions
dataset.df["predicted_label"] = results_df.y_pred_label.values
dataset.df["predicted_probability"] = results_df.y_probability.values
@@ -96,4 +110,8 @@ async def angular_predict_elements(body):
else:
del model
gc.collect()
- return results_df[columns_to_publish].to_dict(orient="records")
+ result = results_df[columns_to_publish].to_dict(orient="records")
+ element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json)
+ for element in result:
+ element["is_shown"] = element_id_to_is_displayed_map.get(element["element_id"], None)
+ return result
diff --git a/ds_methods/html5_predict.py b/ds_methods/html5_predict.py
index 4cff473d..454df308 100644
--- a/ds_methods/html5_predict.py
+++ b/ds_methods/html5_predict.py
@@ -8,12 +8,9 @@
import pandas as pd
from async_lru import alru_cache
-from app import (
- UPLOAD_DIRECTORY,
- html5_classes_path,
- html5_df_path,
- html5_model,
-)
+from app import (UPLOAD_DIRECTORY, html5_classes_path, html5_df_path,
+ html5_model)
+from app.selenium_app import get_element_id_to_is_displayed_mapping
from utils.dataset import HTML5_JDNDataset
logger = logging.getLogger("jdi-qasp-ml")
@@ -21,6 +18,11 @@
@alru_cache(maxsize=32)
async def html5_predict_elements(body):
+ body_str = body.decode("utf-8")
+ body_json = json.loads(body_str)
+    elements_json = body_json.get("elements", "[]")  # a JSON string, per PredictionInputModel
+ document_json = body_json.get("document", "")
+
# generate temporary filename
filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json")
with open(os.path.join(UPLOAD_DIRECTORY, filename), "wb") as fp:
@@ -30,7 +32,7 @@ async def html5_predict_elements(body):
filename = filename.replace(".json", ".pkl")
logger.info(f"saving {filename}")
- df = pd.DataFrame(json.loads(body))
+ df = pd.DataFrame(json.loads(elements_json))
# fix bad data which can come in 'onmouseover', 'onmouseenter'
df.onmouseover = df.onmouseover.apply(
@@ -48,6 +50,7 @@ async def html5_predict_elements(body):
dataset_type="html5",
rebalance_and_shuffle=False,
)
+
# load model
logger.info("Loading the model")
pkl_filename = "DT_model.pkl"
@@ -89,4 +92,10 @@ async def html5_predict_elements(body):
else:
del model
gc.collect()
- return results_df[columns_to_publish].to_dict(orient="records")
+ result = results_df[columns_to_publish].to_dict(orient="records")
+
+    logger.info("Determining element visibility")
+ element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json)
+ for element in result:
+ element["is_shown"] = element_id_to_is_displayed_map.get(element["element_id"], None)
+ return result
diff --git a/ds_methods/mui_predict.py b/ds_methods/mui_predict.py
index 74b5cb09..640e31c5 100644
--- a/ds_methods/mui_predict.py
+++ b/ds_methods/mui_predict.py
@@ -11,6 +11,7 @@
from tqdm.auto import trange
from app import UPLOAD_DIRECTORY, mui_df_path, mui_model
+from app.selenium_app import get_element_id_to_is_displayed_mapping
from utils.dataset import MUI_JDNDataset
logger = logging.getLogger("jdi-qasp-ml")
@@ -18,17 +19,25 @@
@alru_cache(maxsize=32)
async def mui_predict_elements(body):
+ body_str = body.decode("utf-8")
+ body_json = json.loads(body_str)
+    elements_json = body_json.get("elements", "[]")  # a JSON string, per PredictionInputModel
+ document_json = body_json.get("document", "")
+
    # create softmax layer function to get probabilities from logits
softmax = torch.nn.Softmax(dim=1)
+
# generate temporary filename
filename = dt.datetime.now().strftime("%Y%m%d%H%M%S%f.json")
with open(os.path.join(UPLOAD_DIRECTORY, filename), "wb") as fp:
logger.info(f"saving {filename}")
fp.write(body)
fp.flush()
+
filename = filename.replace(".json", ".pkl")
logger.info(f"saving {filename}")
- df = pd.DataFrame(json.loads(body))
+ df = pd.DataFrame(json.loads(elements_json))
+
# fix bad data which can come in 'onmouseover', 'onmouseenter'
df.onmouseover = df.onmouseover.apply(
lambda x: "true" if x is not None else None
@@ -36,11 +45,14 @@ async def mui_predict_elements(body):
df.onmouseenter = df.onmouseenter.apply(
lambda x: "true" if x is not None else None
)
+
df.to_pickle(f"{mui_df_path}/{filename}")
+
logger.info("Creating JDNDataset")
dataset = MUI_JDNDataset(
datasets_list=[filename.split(".")[0]], rebalance_and_shuffle=False
)
+
dataloader = DataLoader(dataset, shuffle=False, batch_size=1)
device = "cpu"
logger.info(f"Load model with hardcoded device: {device}")
@@ -48,6 +60,7 @@ async def mui_predict_elements(body):
device=device
)
model.eval()
+
logger.info("Predicting...")
results = []
with trange(len(dataloader)) as bar:
@@ -80,7 +93,8 @@ async def mui_predict_elements(body):
)
bar.update(1)
results_df = pd.DataFrame(results)
- # # update the dataset with predictions
+
+ # update the dataset with predictions
dataset.df["predicted_label"] = results_df.y_pred_label.values
dataset.df["predicted_probability"] = results_df.y_probability.values
columns_to_publish = [
@@ -98,4 +112,8 @@ async def mui_predict_elements(body):
else:
del model
gc.collect()
- return results_df[columns_to_publish].to_dict(orient="records")
+ result = results_df[columns_to_publish].to_dict(orient="records")
+ element_id_to_is_displayed_map = get_element_id_to_is_displayed_mapping(document_json)
+ for element in result:
+ element["is_shown"] = element_id_to_is_displayed_map.get(element["element_id"], None)
+ return result
diff --git a/model/version/0.2.40 b/model/version/0.2.40
deleted file mode 100644
index e69de29b..00000000
diff --git a/model/version/0.2.62 b/model/version/0.2.62
new file mode 100644
index 00000000..d3f5a12f
--- /dev/null
+++ b/model/version/0.2.62
@@ -0,0 +1 @@
+
diff --git a/start_celery.sh b/start_celery.sh
new file mode 100644
index 00000000..8d17508f
--- /dev/null
+++ b/start_celery.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
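+# use all available cores minus two (leaving CPU headroom), but never fewer than one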
+CORES_TO_USE=$(($(nproc)-2>0?$(nproc)-2:1))
+echo "Celery worker concurrency: $CORES_TO_USE"
+celery -A app.celery_app:celery_app worker -l info --concurrency $CORES_TO_USE
diff --git a/utils/api_utils.py b/utils/api_utils.py
index 214bd95c..afd017d5 100644
--- a/utils/api_utils.py
+++ b/utils/api_utils.py
@@ -13,7 +13,7 @@
from app.logger import logger
from app.models import LoggingInfoModel, TaskStatusModel, XPathGenerationModel
from app.redis_app import redis_app
-from app.tasks import task_schedule_xpath_generation
+from app.tasks import ENV, task_schedule_xpath_generation
def get_task_status(task_id) -> TaskStatusModel:
@@ -87,21 +87,22 @@ async def wait_until_task_reach_status(
# deleting underscores in task_id if any to send to frontend
result["id"] = result["id"].strip("_")
- session_id = task_result_obj.kwargs.get("session_id")
- website_url = task_result_obj.kwargs.get("website_url")
- start_time = task_result_obj.kwargs.get("start_time")
- task_duration = task_result_obj.kwargs.get("task_duration")
- full_xpath = task_result_obj.kwargs.get("full_xpath")
- nesting_num = task_result_obj.kwargs.get("nesting_num")
- await mongodb.enrich_logs_with_generated_locators(
- session_id,
- website_url,
- full_xpath,
- nesting_num,
- result,
- start_time,
- task_duration,
- )
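+    # locator metrics are written to MongoDB only outside local runs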
+ if ENV != "LOCAL":
+ session_id = task_result_obj.kwargs.get("session_id")
+ website_url = task_result_obj.kwargs.get("website_url")
+ start_time = task_result_obj.kwargs.get("start_time")
+ task_duration = task_result_obj.kwargs.get("task_duration")
+ full_xpath = task_result_obj.kwargs.get("full_xpath")
+ nesting_num = task_result_obj.kwargs.get("nesting_num")
+ await mongodb.enrich_logs_with_generated_locators(
+ session_id,
+ website_url,
+ full_xpath,
+ nesting_num,
+ result,
+ start_time,
+ task_duration,
+ )
try:
await ws.send_json(
get_websocket_response(
@@ -251,9 +252,10 @@ async def process_incoming_ws_request(
elif action == "schedule_multiple_xpath_generations":
logging_info = LoggingInfoModel(**logging_info)
- mongodb.create_initial_log_entry(
- logging_info
- ) # for custom metrics logging purposes
+ if ENV != "LOCAL":
+ mongodb.create_initial_log_entry(
+ logging_info
+ ) # for custom metrics logging purposes
payload = XPathGenerationModel(**payload)
element_ids = payload.id
diff --git a/utils/config.py b/utils/config.py
index a6724c74..c18f4b90 100644
--- a/utils/config.py
+++ b/utils/config.py
@@ -37,4 +37,8 @@
SMTP_HOST = os.getenv("SMTP_HOST", "smtp.yandex.ru")
RECIPIENT_EMAILS = os.getenv("RECIPIENT_EMAILS", "SupportJDI@epam.com")
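+# defaults to the number of CPUs available to the current process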
+SELENOID_PARALLEL_SESSIONS_COUNT = int(
+ os.getenv("SELENOID_PARALLEL_SESSIONS_COUNT", len(os.sched_getaffinity(0)))
+)
+
logger.info("Module utils.config was loaded")