From 4f6928e03908e9e7e6307efdf0ffb31908bb8ca9 Mon Sep 17 00:00:00 2001
From: pavlovicmilena
Date: Tue, 3 Aug 2021 16:59:05 +0200
Subject: [PATCH] standardize metadata to include "identifier" instead of "repertoire_identifier"

---
 docs/source/galaxy/galaxy_intro.rst                     | 2 +-
 immuneML/IO/dataset_export/AIRRExporter.py              | 3 ++-
 immuneML/IO/dataset_import/ImmuneMLImport.py            | 4 ++--
 immuneML/data_model/dataset/RepertoireDataset.py        | 2 +-
 immuneML/encodings/distance_encoding/DistanceEncoder.py | 6 +++---
 immuneML/environment/Constants.py                       | 2 +-
 immuneML/util/ImportHelper.py                           | 1 -
 immuneML/util/RepertoireBuilder.py                      | 2 +-
 immuneML/workflows/steps/SignalImplanter.py             | 2 +-
 9 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/galaxy/galaxy_intro.rst b/docs/source/galaxy/galaxy_intro.rst
index a9ab8a366..46be1ab27 100644
--- a/docs/source/galaxy/galaxy_intro.rst
+++ b/docs/source/galaxy/galaxy_intro.rst
@@ -163,4 +163,4 @@ you expected of the tool, and click 'Report'.
 
 .. image:: ../_static/images/galaxy/report_bug.png
    :alt: bug report
-   :width: 250
+   :width: 80%
diff --git a/immuneML/IO/dataset_export/AIRRExporter.py b/immuneML/IO/dataset_export/AIRRExporter.py
index 6cddfb8a0..cddabff70 100644
--- a/immuneML/IO/dataset_export/AIRRExporter.py
+++ b/immuneML/IO/dataset_export/AIRRExporter.py
@@ -84,8 +84,9 @@ def get_sequence_aa_field(region_type):
     @staticmethod
     def export_updated_metadata(dataset: RepertoireDataset, result_path: Path, repertoire_folder: str):
         df = pd.read_csv(dataset.metadata_file, comment=Constants.COMMENT_SIGN)
-        identifiers = df["repertoire_identifier"].values.tolist() if "repertoire_identifier" in df.columns else dataset.get_example_ids()
+        identifiers = df["identifier"].values.tolist() if "identifier" in df.columns else dataset.get_example_ids()
         df["filename"] = [str(Path(repertoire_folder) / f"{repertoire.data_filename.stem}.tsv") for repertoire in dataset.get_data()]
+        df['identifier'] = identifiers
         df.to_csv(result_path / "metadata.csv", index=False)
 
     @staticmethod
diff --git a/immuneML/IO/dataset_import/ImmuneMLImport.py b/immuneML/IO/dataset_import/ImmuneMLImport.py
index 754749ba5..d632d014b 100644
--- a/immuneML/IO/dataset_import/ImmuneMLImport.py
+++ b/immuneML/IO/dataset_import/ImmuneMLImport.py
@@ -140,8 +140,8 @@ def _update_receptor_paths(pickle_params, dataset: ElementDataset):
         return dataset
 
     @staticmethod
-    def _discover_repertoire_path(pickle_params, dataset):
-        dataset_dir = ImmuneMLImport._discover_dataset_dir(pickle_params)
+    def _discover_repertoire_path(params, dataset):
+        dataset_dir = ImmuneMLImport._discover_dataset_dir(params)
 
         if len(list(dataset_dir.glob("*.npy"))) == len(dataset.repertoires):
             path = dataset_dir
diff --git a/immuneML/data_model/dataset/RepertoireDataset.py b/immuneML/data_model/dataset/RepertoireDataset.py
index 269bba82f..357d3f4e6 100644
--- a/immuneML/data_model/dataset/RepertoireDataset.py
+++ b/immuneML/data_model/dataset/RepertoireDataset.py
@@ -26,7 +26,7 @@ def build(cls, **kwargs):
                     filename = filename.parent.parent / Path(row['filename']).name
 
                 repertoire = Repertoire(data_filename=filename, metadata_filename=filename.parent / f'{filename.stem}_metadata.yaml',
-                                        identifier=row['repertoire_identifier'])
+                                        identifier=row['identifier'])
                 repertoires.append(repertoire)
 
         if "repertoire_ids" in kwargs.keys() and "repertoires" not in kwargs.keys() and kwargs['repertoire_ids'] is not None:
diff --git a/immuneML/encodings/distance_encoding/DistanceEncoder.py b/immuneML/encodings/distance_encoding/DistanceEncoder.py
index e1c604cca..364d67691 100644
--- a/immuneML/encodings/distance_encoding/DistanceEncoder.py
+++ b/immuneML/encodings/distance_encoding/DistanceEncoder.py
@@ -107,13 +107,13 @@ def build_distance_matrix(self, dataset: RepertoireDataset, params: EncoderParam
 
     def build_labels(self, dataset: RepertoireDataset, params: EncoderParams) -> dict:
 
-        lbl = ["repertoire_identifier"]
+        lbl = ["identifier"]
         lbl.extend(params.label_config.get_labels_by_name())
 
         tmp_labels = dataset.get_metadata(lbl, return_df=True)
-        tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['repertoire_identifier']).get_indexer(dataset.get_repertoire_ids())]
+        tmp_labels = tmp_labels.iloc[pd.Index(tmp_labels['identifier']).get_indexer(dataset.get_repertoire_ids())]
         tmp_labels = tmp_labels.to_dict("list")
-        del tmp_labels["repertoire_identifier"]
+        del tmp_labels["identifier"]
 
         return tmp_labels
 
diff --git a/immuneML/environment/Constants.py b/immuneML/environment/Constants.py
index 23976a9b8..38788c77a 100644
--- a/immuneML/environment/Constants.py
+++ b/immuneML/environment/Constants.py
@@ -1,6 +1,6 @@
 class Constants:
 
-    VERSION = "2.0.2"
+    VERSION = "2.0.3"
 
     # encoding constants
     FEATURE_DELIMITER = "///"
diff --git a/immuneML/util/ImportHelper.py b/immuneML/util/ImportHelper.py
index 356cdfb78..2f3e3d88e 100644
--- a/immuneML/util/ImportHelper.py
+++ b/immuneML/util/ImportHelper.py
@@ -495,7 +495,6 @@ def import_receptors_by_id(df, identifier, chain_pair, metadata_columns) -> List
                     f"Missing {chain_pair.value[i]} chain for receptor with identifier {identifier}, this receptor will be omitted.")
                 return []
 
-        # todo add options like IRIS import: option to import all dual chains or just the first pair / all V genes when uncertain annotation, etc
         # todo add possibility to import multiple chain combo's? (BCR heavy-light & heavy-kappa, as seen in 10xGenomics?)
         return [ImportHelper.build_receptor_from_rows(first_row.iloc[0], second_row.iloc[0], identifier, chain_pair, metadata_columns)]
 
diff --git a/immuneML/util/RepertoireBuilder.py b/immuneML/util/RepertoireBuilder.py
index 63aaee088..381cf05fa 100644
--- a/immuneML/util/RepertoireBuilder.py
+++ b/immuneML/util/RepertoireBuilder.py
@@ -56,7 +56,7 @@ def build(sequences: list, path: Path, labels: dict = None, seq_metadata: list =
 
         df = pd.DataFrame({**{"filename": [repertoire.data_filename for repertoire in repertoires],
                               "subject_id": subject_ids,
-                              "repertoire_identifier": [repertoire.identifier for repertoire in repertoires]},
+                              "identifier": [repertoire.identifier for repertoire in repertoires]},
                            **(labels if labels is not None else {})})
         df.to_csv(path / "metadata.csv", index=False)
 
diff --git a/immuneML/workflows/steps/SignalImplanter.py b/immuneML/workflows/steps/SignalImplanter.py
index d718c0827..9b0038324 100644
--- a/immuneML/workflows/steps/SignalImplanter.py
+++ b/immuneML/workflows/steps/SignalImplanter.py
@@ -118,7 +118,7 @@ def _create_metadata_file(processed_repertoires: List[Repertoire], simulation_st
 
         path = simulation_state.result_path / "metadata.csv"
 
-        new_df = pd.DataFrame([{**repertoire.metadata, **{'repertoire_identifier': repertoire.identifier}} for repertoire in processed_repertoires])
+        new_df = pd.DataFrame([{**repertoire.metadata, **{'identifier': repertoire.identifier}} for repertoire in processed_repertoires])
         new_df.drop('field_list', axis=1, inplace=True)
         new_df["filename"] = [repertoire.data_filename.name for repertoire in processed_repertoires]
         new_df.to_csv(path, index=False)
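Note, not part of the patch itself: metadata.csv files written before this change (immuneML versions up to 2.0.2) still use the old "repertoire_identifier" column. A minimal migration sketch follows, assuming a plain metadata.csv without comment header lines; the helper name and file path are illustrative only, not shipped with immuneML:

    # Hypothetical helper (not part of immuneML): rename the old metadata column
    # "repertoire_identifier" to the standardized "identifier" introduced here,
    # so older exported metadata files match the new column name.
    from pathlib import Path

    import pandas as pd

    def rename_identifier_column(metadata_path: Path) -> None:
        df = pd.read_csv(metadata_path)
        if "repertoire_identifier" in df.columns and "identifier" not in df.columns:
            df = df.rename(columns={"repertoire_identifier": "identifier"})
            df.to_csv(metadata_path, index=False)

    rename_identifier_column(Path("metadata.csv"))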