[SCHEMATIC-183] Use paths from file view for manifest generation #1529

Open

wants to merge 92 commits into develop

Changes from 90 commits (92 commits total)
8a80ce3
add test for clause method
GiaJordan Oct 29, 2024
a2de0c4
add method to process dataset id into query clause
GiaJordan Oct 29, 2024
45193a1
use new method for validation
GiaJordan Oct 29, 2024
80b6bda
update clause method
GiaJordan Oct 29, 2024
d810576
update file based manifest gen test
GiaJordan Oct 29, 2024
b1e60af
consolidate filebased manifest gen tests
GiaJordan Oct 29, 2024
33796b4
update test layout
GiaJordan Oct 29, 2024
62d8b28
use fileview for file paths
GiaJordan Oct 30, 2024
8b012d3
add functionality to just return filename
GiaJordan Oct 31, 2024
8e670ad
make syn id regex a util function
GiaJordan Oct 31, 2024
c90ebf0
add non-api integration test for detFilesInStorageDataset
GiaJordan Oct 31, 2024
f9b9bcc
fix typo and mismatched ids
GiaJordan Oct 31, 2024
be4f6b6
add case for nested data structure
GiaJordan Oct 31, 2024
b39afe2
get nested files as well
GiaJordan Nov 1, 2024
566e741
add return type annotation
GiaJordan Nov 1, 2024
86fb15d
add docstring
GiaJordan Nov 1, 2024
2fa1ad6
add str prefix
GiaJordan Nov 1, 2024
cd6713b
revert param name
GiaJordan Nov 4, 2024
9438811
change datasetid clause method
GiaJordan Nov 4, 2024
e076770
get files in doubly+ nested files
GiaJordan Nov 5, 2024
19188ea
add comments
GiaJordan Nov 5, 2024
c46a100
add test cases for filtering results
GiaJordan Nov 5, 2024
ccead76
add test case for filtered results
GiaJordan Nov 7, 2024
26e5539
Update README.md
jaymedina Oct 21, 2024
2f835bd
Update README.md
jaymedina Oct 21, 2024
f228245
Update README.md
jaymedina Oct 22, 2024
ae11b85
updated data model type rules to include error param
andrewelamb Oct 25, 2024
2daacb9
fix validate type attribute to use msg level param
andrewelamb Oct 25, 2024
2982d8e
added error handling
andrewelamb Oct 25, 2024
a1e0783
run black
andrewelamb Oct 25, 2024
450fbdf
Update CODEOWNERS
thomasyu888 Nov 1, 2024
b16bf55
Update scan_repo.yml
thomasyu888 Nov 1, 2024
8d50e1a
Update .github/CODEOWNERS
thomasyu888 Nov 1, 2024
1336fc6
Update .github/workflows/scan_repo.yml
thomasyu888 Nov 1, 2024
c61f39c
Attach additional telemetry data to OTEL traces (#1519)
BryanFauble Nov 1, 2024
ce4d642
feat: added tracing for cross manifest validation and file name valid…
linglp Nov 1, 2024
31f3f1d
Updating contribution doc to expect squash and merge (#1534)
BryanFauble Nov 5, 2024
256403c
[FDS-2491] Integration tests for Schematic API Test plan (#1512)
BryanFauble Nov 5, 2024
856fef6
[FDS-2500] Add Integration Tests for: Manifest Validation (#1516)
jaymedina Nov 6, 2024
d6fc9ad
[FDS-2449] Lock `sphinx` version and update `poetry.lock` (#1530)
jaymedina Nov 7, 2024
08008ae
filter based on filenames if given
GiaJordan Nov 8, 2024
d0aa01d
change manifest exclusion method
GiaJordan Nov 8, 2024
38cedd5
Update file annotation store process to require filename be present i…
BryanFauble Nov 7, 2024
22f0bba
Revert "Update file annotation store process to require filename be p…
BryanFauble Nov 7, 2024
4580e06
Don't attempt to annotate the table
BryanFauble Nov 7, 2024
ce5c349
Updates for integration test failures (#1537)
BryanFauble Nov 12, 2024
d661b9a
add test for bug case
GiaJordan Nov 11, 2024
951a061
update test for table tidyness
GiaJordan Nov 11, 2024
89fb9a8
remove unused import
GiaJordan Nov 11, 2024
0c9e773
remove etag column if already present when building temp file view
GiaJordan Nov 11, 2024
ab4ece7
catch all exceptions to switch to sequential mode
GiaJordan Nov 12, 2024
65acb33
update test for updated data
GiaJordan Nov 12, 2024
2e6d51f
Revert "update test for updated data"
GiaJordan Nov 12, 2024
9a00288
Revert "catch all exceptions to switch to sequential mode"
GiaJordan Nov 12, 2024
2170974
catch ValueErrors as well
GiaJordan Nov 12, 2024
65fb55d
[FDS-2525] Authenticated export of telemetry data (#1527)
BryanFauble Nov 13, 2024
5e891ef
update mocking for unit tests
GiaJordan Nov 13, 2024
4d6fd09
Merge branch 'develop' into fds-2293-file-paths-for-manifest-gen
thomasyu888 Nov 14, 2024
5f5cc43
update test assertions for format
GiaJordan Nov 14, 2024
3f90ef0
update tests assertions
GiaJordan Nov 14, 2024
f4ad7a1
add mocked integration test for getting dataset files
GiaJordan Nov 15, 2024
ee43ad6
use dataset clause method
GiaJordan Nov 15, 2024
5a4cd90
Revert "add mocked integration test for getting dataset files"
GiaJordan Nov 15, 2024
59c5a69
add mocked test for get files in dataset
GiaJordan Nov 15, 2024
d2eee35
clean comments
GiaJordan Nov 15, 2024
0eddd47
remove comment
GiaJordan Nov 15, 2024
5fcbeb1
add test ids
GiaJordan Nov 15, 2024
a1b0c90
remove unneeded param
GiaJordan Nov 15, 2024
c43debc
add ids
GiaJordan Nov 15, 2024
89a2b9f
use syn store fixture
GiaJordan Nov 15, 2024
9cdad89
change variables
GiaJordan Nov 15, 2024
51983bc
change to global var
GiaJordan Nov 18, 2024
84011b0
change case
GiaJordan Nov 18, 2024
3471e18
change case
GiaJordan Nov 18, 2024
82082be
update test for dataset clause
GiaJordan Nov 19, 2024
c542241
update use of dataset clause method
GiaJordan Nov 19, 2024
8a314a7
comments
GiaJordan Nov 19, 2024
bc9f479
change test name case
GiaJordan Nov 19, 2024
79f5bda
update descriptions
GiaJordan Nov 19, 2024
5d0599b
undo development change
GiaJordan Nov 20, 2024
c1cd79a
add comment
GiaJordan Nov 20, 2024
feb82e8
remove dev work
GiaJordan Nov 20, 2024
c1587c6
remove temp test marks
GiaJordan Nov 20, 2024
2769d6a
update test for new expected order
GiaJordan Nov 20, 2024
dd9d6a9
change method for gathering files in a dataset
GiaJordan Nov 20, 2024
800a4cd
update mock test
GiaJordan Nov 20, 2024
95cfeed
update other mocked test
GiaJordan Nov 20, 2024
9701078
change method for building dataset path
GiaJordan Nov 22, 2024
7cd0ce0
wrap path in quotes
GiaJordan Nov 22, 2024
65dcc35
update quotes for dataset path
GiaJordan Nov 22, 2024
f9d037d
reformat and add exception
GiaJordan Nov 22, 2024
e07f973
add unit test
GiaJordan Nov 22, 2024
2 changes: 2 additions & 0 deletions schematic/manifest/generator.py
@@ -1904,6 +1904,8 @@ def get_manifest(
# TODO: avoid explicitly exposing Synapse store functionality
# just instantiate a Store class and let it decide at runtime/config
# the store type
# TODO: determine which parts of fileview are necessary for `get` operations
# and pass query parameters at object instantiation to avoid having to re-query
if access_token:
# for getting an existing manifest on AWS
store = SynapseStorage(access_token=access_token)
4 changes: 3 additions & 1 deletion schematic/models/validate_attribute.py
@@ -2119,7 +2119,9 @@ def filename_validation(

where_clauses = []

dataset_clause = f"parentId='{dataset_scope}'"
dataset_clause = SynapseStorage.build_clause_from_dataset_id(
dataset_id=dataset_scope
)
where_clauses.append(dataset_clause)

self._login(
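The `filename_validation` change above swaps an inline f-string for the new static clause builder added to `SynapseStorage` in this PR. A minimal standalone sketch of that builder's behavior, re-implemented here for illustration (the real method lives on `SynapseStorage`, so this mirror is an assumption based on the diff body shown below in `schematic/store/synapse.py`):

```python
from typing import Optional


def build_clause_from_dataset_id(
    dataset_id: Optional[str] = None,
    dataset_folder_list: Optional[list] = None,
) -> str:
    """Standalone mirror of SynapseStorage.build_clause_from_dataset_id."""
    # No scoping information: return an empty clause (the query is unscoped)
    if (not dataset_id) and (not dataset_folder_list):
        return ""
    # A folder list scopes the query to a dataset and all of its subfolders
    if dataset_folder_list:
        search_folders = ", ".join(f"'{syn_id}'" for syn_id in dataset_folder_list)
        return f"parentId IN ({search_folders})"
    # A single dataset ID scopes the query to files directly under the dataset
    return f"parentId='{dataset_id}'"


print(build_clause_from_dataset_id(dataset_id="syn123"))
# parentId='syn123'
print(build_clause_from_dataset_id(dataset_folder_list=["syn123", "syn456"]))
# parentId IN ('syn123', 'syn456')
```

Because the method is static, callers such as `filename_validation` can build the clause before any `SynapseStorage` object is initialized, which is the point of the refactor.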
148 changes: 55 additions & 93 deletions schematic/store/synapse.py
@@ -416,6 +416,30 @@ def query_fileview(
else:
raise AccessCredentialsError(self.storageFileview)

@staticmethod
def build_clause_from_dataset_id(
dataset_id: Optional[str] = None, dataset_folder_list: Optional[list] = None
) -> str:
"""
Method to build a where clause for a Synapse FileView query based on a dataset ID that can be used before an object is initialized.
Args:
dataset_id: Synapse ID of a dataset that should be used to limit the query
dataset_folder_list: List of Synapse IDs of a dataset and all its subfolders that should be used to limit the query
Returns:
clause for the query or an empty string if no dataset ID is provided
"""
# Calling this method without specifying synIDs will complete but will not scope the view
if (not dataset_id) and (not dataset_folder_list):
return ""

# This will be used to gather files under a dataset recursively with a fileview query instead of walking
if dataset_folder_list:
search_folders = ", ".join(f"'{synId}'" for synId in dataset_folder_list)
return f"parentId IN ({search_folders})"

# `dataset_id` should be provided when all files are stored directly under the dataset folder
return f"parentId='{dataset_id}'"

def _build_query(
self, columns: Optional[list] = None, where_clauses: Optional[list] = None
):
@@ -666,7 +690,7 @@ def getStorageDatasetsInProject(self, projectId: str) -> list[tuple[str, str]]:
def getFilesInStorageDataset(
self, datasetId: str, fileNames: List = None, fullpath: bool = True
) -> List[Tuple[str, str]]:
"""Gets all files in a given dataset folder.
"""Gets all files (excluding manifest files) in a given dataset folder.

Args:
datasetId: synapse ID of a storage dataset.
@@ -680,105 +704,43 @@ def getFilesInStorageDataset(
Raises:
ValueError: Dataset ID not found.
"""
# select all files within a given storage dataset folder (top level folder in
# a Synapse storage project or folder marked with contentType = 'dataset')
walked_path = synapseutils.walk(
self.syn, datasetId, includeTypes=["folder", "file"]
)
file_list = []

current_entity_location = self.synapse_entity_tracker.get(
synapse_id=datasetId, syn=self.syn, download_file=False
)
# Get path to dataset folder from fileview to avoid building a new fileview and walking to determine folders and files within
child_path = self.storageFileviewTable.loc[
self.storageFileviewTable["parentId"] == datasetId, "path"
][0]
parent = child_path.split("/")[0]
dataset_path = f"'{parent}/%'"

def walk_back_to_project(
current_location: Entity, location_prefix: str, skip_entry: bool
) -> str:
"""
Recursively walk back up the project structure to get the paths of the
names of each of the directories where we started the walk function.

Args:
current_location (Entity): The current entity location in the project structure.
location_prefix (str): The prefix to prepend to the path.
skip_entry (bool): Whether to skip the current entry in the path. When
this is True it means we are looking at our starting point. If our
starting point is the project itself we can go ahead and return
back the project as the prefix.

Returns:
str: The path of the names of each of the directories up to the project root.
"""
if (
skip_entry
and "concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return f"{current_location.name}/{location_prefix}"
# When querying, only include files to exclude entity files and subdirectories
where_clauses = [f"path like {dataset_path}", "type='file'"]

updated_prefix = (
location_prefix
if skip_entry
else f"{current_location.name}/{location_prefix}"
)
if (
"concreteType" in current_location
and current_location["concreteType"] == PROJECT_ENTITY
):
return updated_prefix
current_location = self.synapse_entity_tracker.get(
synapse_id=current_location["parentId"],
syn=self.syn,
download_file=False,
)
return walk_back_to_project(
current_location=current_location,
location_prefix=updated_prefix,
skip_entry=False,
)
# Requery the fileview to specifically get the files in the given dataset
self.query_fileview(columns=["id", "path"], where_clauses=where_clauses)

prefix = walk_back_to_project(
current_location=current_entity_location,
location_prefix="",
skip_entry=True,
)
# Exclude manifest files
non_manifest_files = self.storageFileviewTable.loc[
thomasyu888 (Member) commented:

Would it be worthwhile documenting somewhere that if people upload a file with "manifest" in the name it will be included here?

GiaJordan (Contributor, Author) replied:

@thomasyu888 I think we could note that higher up in the docstring instead of under one of the parameters, and in the description of the storage/dataset/files endpoint.
As an aside, do we know of any users that don't use the default basename and change it in their configs? I guess we could also just check for both.

thomasyu888 (Member) replied:

@GiaJordan good point. For now, I searched "basename" in data_curator_config (https://github.com/search?q=repo%3ASage-Bionetworks%2Fdata_curator_config%20basename&type=code) and didn't get anything returned, but we'll probably want to watch for that eventually and pull from the basename config instead of using "manifest", because people can technically change that.
Can you help create a ticket for that to track?

GiaJordan (Contributor, Author) replied:

Yes, I can. It could also technically be changed locally in the config.yml, but I'm not aware of any projects that currently change that parameter.

~self.storageFileviewTable["path"].str.contains("synapse_storage_manifest"),
:,
]

project_id = self.getDatasetProject(datasetId)
project = self.synapse_entity_tracker.get(
synapse_id=project_id, syn=self.syn, download_file=False
)
project_name = project.name
file_list = []
# Remove all files that are not in the list of fileNames
if fileNames:
filename_regex = "|".join(fileNames)

# iterate over all results
for dirpath, _, path_filenames in walked_path:
# iterate over all files in a folder
for path_filename in path_filenames:
if ("manifest" not in path_filename[0] and not fileNames) or (
fileNames and path_filename[0] in fileNames
):
# don't add manifest to list of files unless it is specified in the
# list of specified fileNames; return all found files
# except the manifest if no fileNames have been specified
# TODO: refactor for clarity/maintainability

if fullpath:
# append directory path to filename
if dirpath[0].startswith(f"{project_name}/"):
path_without_project_prefix = (
dirpath[0] + "/"
).removeprefix(f"{project_name}/")
path_filename = (
prefix + path_without_project_prefix + path_filename[0],
path_filename[1],
)
else:
path_filename = (
prefix + dirpath[0] + "/" + path_filename[0],
path_filename[1],
)
matching_files = non_manifest_files["path"].str.contains(
filename_regex, case=False, regex=True
)

non_manifest_files = non_manifest_files.loc[matching_files, :]

# Truncate path if necessary
if not fullpath:
non_manifest_files.path = non_manifest_files.path.apply(os.path.basename)

# add file name file id tuple, rearranged so that id is first and name follows
file_list.append(path_filename[::-1])
# Return list of files as expected by other methods
file_list = list(non_manifest_files.itertuples(index=False, name=None))

return file_list

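The rewritten `getFilesInStorageDataset` replaces the recursive `synapseutils.walk` with a pandas filter over the re-queried fileview. A rough standalone sketch of that filtering pipeline, using a small hypothetical in-memory fileview table in place of the real Synapse query result (column names match the `id`/`path` columns requested in the diff):

```python
import os

import pandas as pd

# Hypothetical fileview query result: id/path pairs under one dataset subtree
fileview = pd.DataFrame(
    {
        "id": ["syn1", "syn2", "syn3"],
        "path": [
            "project/dataset/sample_a.txt",
            "project/dataset/nested/sample_b.txt",
            "project/dataset/synapse_storage_manifest.csv",
        ],
    }
)

# Exclude manifest files, mirroring the `~...str.contains(...)` filter above
non_manifest_files = fileview.loc[
    ~fileview["path"].str.contains("synapse_storage_manifest"), :
].copy()

# Optionally keep only requested filenames via a joined regex
file_names = ["sample_b.txt"]
if file_names:
    filename_regex = "|".join(file_names)
    matching = non_manifest_files["path"].str.contains(
        filename_regex, case=False, regex=True
    )
    non_manifest_files = non_manifest_files.loc[matching, :].copy()

# Truncate to basenames when full paths are not requested
fullpath = False
if not fullpath:
    non_manifest_files["path"] = non_manifest_files["path"].apply(os.path.basename)

# Return (id, path) tuples as downstream callers expect
file_list = list(non_manifest_files.itertuples(index=False, name=None))
print(file_list)  # [('syn2', 'sample_b.txt')]
```

Note that nested files fall out of the query for free: the fileview `path like '<parent>/%'` clause matches files at any depth, which is what the old walk-back-to-project recursion existed to handle.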
9 changes: 5 additions & 4 deletions schematic/utils/cli_utils.py
@@ -4,10 +4,11 @@
# pylint: disable=anomalous-backslash-in-string

import logging

from typing import Any, Mapping, Sequence, Union, Optional
from functools import reduce
import re
from functools import reduce
from typing import Any, Mapping, Optional, Sequence, Union

from schematic.utils.general import SYN_ID_REGEX

logger = logging.getLogger(__name__)

@@ -69,7 +70,7 @@ def parse_syn_ids(
if not syn_ids:
return None

project_regex = re.compile("(syn\d+\,?)+")
project_regex = re.compile(SYN_ID_REGEX)
valid = project_regex.fullmatch(syn_ids)

if not valid:
2 changes: 2 additions & 0 deletions schematic/utils/general.py
@@ -24,6 +24,8 @@

T = TypeVar("T")

SYN_ID_REGEX = r"(syn\d+\,?)+"


def find_duplicates(_list: list[T]) -> set[T]:
"""Find duplicate items in a list"""
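The hoisted `SYN_ID_REGEX` constant is what `parse_syn_ids` now compiles. A quick sketch of how the `fullmatch` check behaves on comma-separated Synapse IDs, with the pattern copied from the diff above:

```python
import re

SYN_ID_REGEX = r"(syn\d+\,?)+"
project_regex = re.compile(SYN_ID_REGEX)

# fullmatch accepts one or more comma-separated Synapse IDs...
print(bool(project_regex.fullmatch("syn123")))         # True
print(bool(project_regex.fullmatch("syn123,syn456")))  # True
# ...and rejects strings that are not purely syn IDs
print(bool(project_regex.fullmatch("not_a_syn_id")))   # False
```

One quirk worth noting: because the comma is optional inside the repeated group, the pattern also tolerates a trailing comma (`"syn123,"` fullmatches).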
4 changes: 2 additions & 2 deletions schematic_api/api/openapi/api.yaml
@@ -692,8 +692,8 @@ paths:
- Synapse Storage
/storage/dataset/files:
get:
summary: Get all files in a given dataset folder
description: Get all files in a given dataset folder
summary: Get all files (excluding manifest files) in a given dataset folder
description: Get all files (excluding manifest files) in a given dataset folder
operationId: schematic_api.api.routes.get_files_storage_dataset
security:
- access_token: []
24 changes: 12 additions & 12 deletions tests/integration/test_commands.py
@@ -4,14 +4,14 @@
import uuid
from io import BytesIO

import numpy as np
import pandas as pd
import pytest
import requests
from openpyxl import load_workbook
from click.testing import CliRunner
import pandas as pd
import numpy as np
from openpyxl import load_workbook

from schematic.configuration.configuration import Configuration, CONFIG
from schematic.configuration.configuration import CONFIG, Configuration
from schematic.manifest.commands import manifest
from schematic.models.commands import model
from tests.conftest import ConfigurationForTesting
@@ -665,18 +665,18 @@ def test_generate_bulk_rna_google_sheet_manifest(
# Reset config to it's default values
CONFIG.load_config("config_example.yml")

assert result.output.split("\n")[7] == (
assert result.output.split("\n")[8] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[8].startswith(
assert result.output.split("\n")[9].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[9] == (
assert result.output.split("\n")[10] == (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna.csv"
)

google_sheet_url = result.output.split("\n")[8]
google_sheet_url = result.output.split("\n")[9]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"
@@ -908,18 +908,18 @@ def test_generate_bulk_rna_google_sheet_manifest_with_annotations(
os.remove("tests/data/example.BulkRNA-seqAssay.schema.json")
os.remove("./CLI_gs_bulk_rna_annos.csv")

assert result.output.split("\n")[10] == (
assert result.output.split("\n")[11] == (
"Find the manifest template using this Google Sheet URL:"
)
assert result.output.split("\n")[11].startswith(
assert result.output.split("\n")[12].startswith(
"https://docs.google.com/spreadsheets/d/"
)
assert result.output.split("\n")[12] == (
assert result.output.split("\n")[13] == (
"Find the manifest template using this CSV file path: "
"./CLI_gs_bulk_rna_annos.csv"
)

google_sheet_url = result.output.split("\n")[11]
google_sheet_url = result.output.split("\n")[12]

# Download the Google Sheets content as an Excel file and load into openpyxl
export_url = f"{google_sheet_url}/export?format=xlsx"