feat: new JobFile model to store UrsaDB iterator batch files #420

Open · wants to merge 7 commits into master
Changes from 4 commits
14 changes: 14 additions & 0 deletions src/db.py
@@ -24,6 +24,7 @@
from .models.job import Job
from .models.jobagent import JobAgent
from .models.match import Match
from .models.queryresult import QueryResult
from .schema import MatchesSchema, ConfigSchema
from .config import app_config

@@ -111,6 +112,19 @@ def add_match(self, job: JobId, match: Match) -> None:
session.add(match)
session.commit()

def add_queryresult(self, job_id: int | None, files: List[str]) -> None:
with self.session() as session:
obj = QueryResult(job_id=job_id, files=files)
session.add(obj)
session.commit()

def remove_queryresult(self, job_id: int | None) -> None:
with self.session() as session:
session.query(QueryResult).where(
QueryResult.job_id == job_id
).delete()
session.commit()

def job_contains(self, job: JobId, ordinal: int, file_path: str) -> bool:
"""Make sure that the file path is in the job results."""
with self.session() as session:
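The two helpers above persist and later discard the file list for a job. A minimal usage sketch, assuming `db` is an already-initialized `Database` instance (the job id and paths are illustrative):

```python
# Hypothetical values; persists the UrsaDB match list for job 42.
db.add_queryresult(job_id=42, files=["/mnt/samples/a.exe", "/mnt/samples/b.dll"])
# ... batches get enqueued while the list is persisted ...
db.remove_queryresult(job_id=42)  # deletes all QueryResult rows for this job
```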
5 changes: 3 additions & 2 deletions src/e2etests/test_api.py
@@ -9,6 +9,7 @@
import requests
import random
import os
import pprint

from ..lib.ursadb import UrsaDb # noqa

@@ -261,7 +262,7 @@ def request_query(log, i, taints=[]):
"taints": taints,
},
)
log.info("API response: %s", res.json())
log.info("API response: %s\n", pprint.pformat(res.json()))
res.raise_for_status()

query_hash = res.json()["query_hash"]
@@ -270,7 +271,7 @@
res = requests.get(
f"http://web:5000/api/matches/{query_hash}?offset=0&limit=50"
)
log.info("API response: %s", res.json())
log.info("API response: %s\n", pprint.pformat(res.json()))
if res.json()["job"]["status"] == "done":
break
time.sleep(1)
5 changes: 2 additions & 3 deletions src/lib/ursadb.py
@@ -63,7 +63,7 @@ def query(
command += f"with taints {taints_whole_str} "
if dataset:
command += f'with datasets ["{dataset}"] '
command += f"into iterator {query};"
command += f"{query};"

start = time.perf_counter()
res = self.__execute(command, recv_timeout=-1)
@@ -75,8 +75,7 @@

return {
"time": (end - start),
"iterator": res["result"]["iterator"],
"file_count": res["result"]["file_count"],
"files": res["result"]["files"],
}

def pop(self, iterator: str, count: int) -> PopResult:
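With the `into iterator` clause dropped, UrsaDB returns the complete match list in one response, so `query()` no longer hands back an iterator name and file count. A hedged sketch of the new return shape (the call and values are illustrative, assuming `ursa` is a connected `UrsaDb` instance):

```python
result = ursa.query('"MZ"')  # illustrative query string
# result == {
#     "time": 0.042,                    # seconds, measured via time.perf_counter()
#     "files": ["/mnt/samples/a.exe"],  # full file list; no "iterator"/"file_count"
# }
```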
32 changes: 32 additions & 0 deletions src/migrations/versions/4e4c88411541_create_queryresult_model.py
@@ -0,0 +1,32 @@
"""create Queryresult model
Revision ID: 4e4c88411541
Revises: dbb81bd4d47f
Create Date: 2024-10-17 14:31:49.278443
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "4e4c88411541"
down_revision = "dbb81bd4d47f"
branch_labels = None
depends_on = None


def upgrade() -> None:
op.create_table(
"queryresult",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("job_id", sa.Integer(), nullable=False),
sa.Column("files", sa.ARRAY(sa.String()), nullable=True),
sa.ForeignKeyConstraint(
["job_id"],
["job.internal_id"],
),
sa.PrimaryKeyConstraint("id"),
)


def downgrade() -> None:
op.drop_table("queryresult")
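For reference, the migration can be exercised programmatically through Alembic's command API; a sketch assuming a standard `alembic.ini` in the working directory:

```python
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")             # config path is an assumption
command.upgrade(cfg, "4e4c88411541")    # creates the queryresult table
command.downgrade(cfg, "dbb81bd4d47f")  # drops it again
```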
8 changes: 8 additions & 0 deletions src/models/queryresult.py
@@ -0,0 +1,8 @@
from sqlmodel import Field, SQLModel, ARRAY, Column, String
from typing import List, Union


class QueryResult(SQLModel, table=True):
id: Union[int, None] = Field(default=None, primary_key=True)
job_id: Union[int, None] = Field(foreign_key="job.internal_id")
files: List[str] = Field(sa_column=Column(ARRAY(String)))
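A short sketch of writing a row through this model, assuming a PostgreSQL engine (the `ARRAY` column type requires PostgreSQL) and an illustrative DSN:

```python
from sqlmodel import Session, create_engine

engine = create_engine("postgresql://mquery:mquery@localhost/mquery")  # assumed DSN
with Session(engine) as session:
    qr = QueryResult(job_id=1, files=["/mnt/samples/a.exe"])
    session.add(qr)
    session.commit()
    session.refresh(qr)
    print(qr.id)  # primary key assigned by the database
```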
44 changes: 20 additions & 24 deletions src/tasks.py
@@ -4,6 +4,7 @@
from redis import Redis
from contextlib import contextmanager
import yara # type: ignore
from itertools import accumulate

from .db import Database, JobId
from .util import make_sha256_tag
@@ -240,9 +241,10 @@ def query_ursadb(job_id: JobId, dataset_id: str, ursadb_query: str) -> None:
if "error" in result:
raise RuntimeError(result["error"])

file_count = result["file_count"]
iterator = result["iterator"]
logging.info(f"Iterator {iterator} contains {file_count} files")
files = result["files"]
agent.db.add_queryresult(job.internal_id, files)

file_count = len(files)

total_files = agent.db.update_job_files(job_id, file_count)
if job.files_limit and total_files > job.files_limit:
@@ -251,42 +253,36 @@
"Try a more precise query."
)

batches = __get_batch_sizes(file_count)
# add len(batches) new tasks, -1 to account for this task
agent.add_tasks_in_progress(job, len(batches) - 1)
batch_sizes = __get_batch_sizes(file_count)
# add len(batch_sizes) new tasks, -1 to account for this task
agent.add_tasks_in_progress(job, len(batch_sizes) - 1)

batched_files = (
files[batch_end - batch_size : batch_end]
for batch_end, batch_size in zip(
accumulate(batch_sizes), batch_sizes
)
)
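The `accumulate`/`zip` pairing above turns the per-batch sizes into running end offsets and slices `files` into consecutive, non-overlapping batches. A standalone worked example with illustrative data:

```python
from itertools import accumulate

files = ["a", "b", "c", "d", "e"]
batch_sizes = [2, 2, 1]
# accumulate(batch_sizes) yields the running ends: 2, 4, 5
batches = [
    files[end - size : end]
    for end, size in zip(accumulate(batch_sizes), batch_sizes)
]
assert batches == [["a", "b"], ["c", "d"], ["e"]]
```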

for batch in batches:
for batch_files in batched_files:
agent.queue.enqueue(
run_yara_batch,
job_id,
iterator,
batch,
batch_files,
job_timeout=app_config.rq.job_timeout,
)

agent.db.dataset_query_done(job_id)
agent.db.remove_queryresult(job.internal_id)


def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None:
def run_yara_batch(job_id: JobId, batch_files: List[str]) -> None:
"""Actually scans files, and updates a database with the results."""
with job_context(job_id) as agent:
job = agent.db.get_job(job_id)
if job.status == "cancelled":
logging.info("Job was cancelled, returning...")
return

pop_result = agent.ursa.pop(iterator, batch_size)
logging.info("job %s: Pop successful: %s", job_id, pop_result)
if pop_result.was_locked:
# Iterator is currently locked, re-enqueue self
agent.queue.enqueue(
run_yara_batch,
job_id,
iterator,
batch_size,
job_timeout=app_config.rq.job_timeout,
)
return

agent.execute_yara(job, pop_result.files)
agent.execute_yara(job, batch_files)
agent.add_tasks_in_progress(job, -1)