From d01daef355c0a8d07dabbb271108438c68a69c12 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 10:04:15 +0200 Subject: [PATCH 01/27] added bitmask difference with test --- lightly/api/bitmask.py | 15 +++++++++++++++ tests/api/test_BitMask.py | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 1cc9c9e27..43ee288e8 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -67,6 +67,11 @@ def _intersection(x: int, y: int) -> int: """ return x & y +def _difference(x: int, y: int) -> int: + """Uses difference to get the intersection of the two masks. + """ + return x - y + def _get_kth_bit(x: int, k: int) -> int: """Returns the kth bit in the mask from the right. @@ -173,6 +178,16 @@ def intersection(self, other): """ self.x = _intersection(self.x, other.x) + def difference(self, other): + """Calculates the difference of two bit masks. + Examples: + >>> mask1 = BitMask.from_bin('0b0111') + >>> mask2 = BitMask.from_bin('0b1100') + >>> mask1.difference(mask2) + >>> # mask1.binstring is '0b0011' + """ + self.x = _difference(self.x, other.x) + def get_kth_bit(self, k: int) -> bool: """Returns the boolean value of the kth bit from the right. """ diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index d6997cae0..736879d9c 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -80,6 +80,12 @@ def test_intersection(self): mask_a.intersection(mask_b) self.assertEqual(mask_a.x, int("0b100", 2)) + def test_difference(self): + mask_a = BitMask.from_bin("0b101") + mask_b = BitMask.from_bin("0b001") + mask_a.difference(mask_b) + self.assertEqual(mask_a.x, int("0b100", 2)) + def test_nonzero_bits(self): mask = BitMask.from_bin("0b0") From 16c37b77faaedc740c58bb04e501c505d64bbb01 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 10:35:18 +0200 Subject: [PATCH 02/27] added Bitmask.from_length() constructor --- lightly/api/bitmask.py | 10 +++++++++- tests/api/test_BitMask.py | 13 ++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 43ee288e8..9ad2886b0 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -67,6 +67,7 @@ def _intersection(x: int, y: int) -> int: """ return x & y + def _difference(x: int, y: int) -> int: """Uses difference to get the intersection of the two masks. """ @@ -128,6 +129,13 @@ def from_bin(cls, binstring: str): """ return cls(_bin_to_int(binstring)) + @classmethod + def from_length(cls, length: int): + """Creates a all-true bitmask of a predefined length + """ + binstring = '0b' + '1' * length + return cls.from_bin(binstring) + def to_hex(self): """Creates a BitMask from a hex string. 
""" @@ -209,4 +217,4 @@ def unset_kth_bit(self, k: int): >>> mask.unset_kth_bit(2) >>> # mask.binstring is '0b1011' """ - self.x = _unset_kth_bit(self.x, k) \ No newline at end of file + self.x = _unset_kth_bit(self.x, k) diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index 736879d9c..5dc91f855 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -21,6 +21,17 @@ def test_get_and_set(self): mask.unset_kth_bit(4) self.assertFalse(mask.get_kth_bit(4)) + def test_large_bitmasks(self): + bitstring = "0b" + "1" * 5678 + mask = BitMask.from_bin(bitstring) + mask_as_bitstring = mask.to_bin() + self.assertEqual(mask_as_bitstring, bitstring) + + def test_bitmask_from_length(self): + length = 4 + mask = BitMask.from_length(length) + self.assertEqual(mask.to_bin(), "0b1111") + def test_get_and_set_outside_of_range(self): mask = BitMask.from_bin("0b11110000") @@ -99,4 +110,4 @@ def test_nonzero_bits(self): also_indices = mask.to_indices() for i, j in zip(indices, also_indices): - self.assertEqual(i, j) \ No newline at end of file + self.assertEqual(i, j) From b21e56eaf80eef953b364ee735c0ce1acf278a98 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 10:51:01 +0200 Subject: [PATCH 03/27] added Bitmask.subset_a_list() method --- lightly/api/bitmask.py | 12 ++++++++++++ tests/api/test_BitMask.py | 8 ++++++++ 2 files changed, 20 insertions(+) diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 9ad2886b0..495ec1ff5 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -196,6 +196,18 @@ def difference(self, other): """ self.x = _difference(self.x, other.x) + def subset_a_list(self, list_: List): + """Returns a subset of a list depending on the bitmask + Examples: + >>> list_to_subset = [4, 7, 9, 1] + >>> mask = BitMask.from_bin("0b0101") + >>> masked_list = mask.subset_a_list(list_to_subset) + >>> # masked_list = [7, 1] + """ + bits = self.to_bin() + reversed_masked_list = [e for e, bit in zip(reversed(list_),reversed(bits)) if bit == "1"] + return list(reversed(reversed_masked_list)) + def get_kth_bit(self, k: int) -> bool: """Returns the boolean value of the kth bit from the right. 
""" diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index 5dc91f855..2c02c0f5f 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -97,6 +97,14 @@ def test_difference(self): mask_a.difference(mask_b) self.assertEqual(mask_a.x, int("0b100", 2)) + def test_subset_a_list(self): + list_ = [4, 7, 9, 1] + mask = BitMask.from_bin("0b0101") + target_masked_list = [7, 1] + masked_list = mask.subset_a_list(list_) + self.assertEqual(target_masked_list,masked_list) + + def test_nonzero_bits(self): mask = BitMask.from_bin("0b0") From ef07f20edf687dba681e86d0fa7bbf51da7429ad Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 11:14:26 +0200 Subject: [PATCH 04/27] bitmasks: added equality and minus operator overloads --- lightly/api/bitmask.py | 6 ++++++ tests/api/test_BitMask.py | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 495ec1ff5..78ca34a26 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -196,6 +196,12 @@ def difference(self, other): """ self.x = _difference(self.x, other.x) + def __sub__(self, other): + return BitMask(self.x - other.x) + + def __eq__(self, other): + return self.to_bin() == other.to_bin() + def subset_a_list(self, list_: List): """Returns a subset of a list depending on the bitmask Examples: diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index 2c02c0f5f..0a5d0ba9a 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -97,6 +97,18 @@ def test_difference(self): mask_a.difference(mask_b) self.assertEqual(mask_a.x, int("0b100", 2)) + def test_operator_minus(self): + mask_a = BitMask.from_bin("0b101") + mask_b = BitMask.from_bin("0b001") + mask_target = BitMask.from_bin("0b100") + self.assertEqual(mask_a-mask_b, mask_target) + + def test_equal(self): + mask_a = BitMask.from_bin("0b101") + mask_b = BitMask.from_bin("0b101") + self.assertEqual(mask_a, mask_b) + + def test_subset_a_list(self): list_ = [4, 7, 9, 1] mask = BitMask.from_bin("0b0101") From a26e5f6df14b320a2b7f6bd8fcf3fb74121b5f15 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 11:15:01 +0200 Subject: [PATCH 05/27] added "added set" to agent and computed the three sets more efficient with bitmasks --- lightly/active_learning/agents/agent.py | 79 ++++++++++--------- .../test_active_learning_agent.py | 7 +- 2 files changed, 45 insertions(+), 41 deletions(-) diff --git a/lightly/active_learning/agents/agent.py b/lightly/active_learning/agents/agent.py index 8b708b5c5..2eab9020d 100644 --- a/lightly/active_learning/agents/agent.py +++ b/lightly/active_learning/agents/agent.py @@ -76,28 +76,29 @@ def _set_labeled_and_unlabeled_set(self, preselected_tag_data: TagData = None): optional param, then it must not be loaded from the API """ - if self.preselected_tag_id is None: - self.labeled_set = [] - else: + self.bitmask_labeled_set = BitMask.from_hex("0x0") + self.bitmask_added_set = BitMask.from_hex("0x0") + if self.preselected_tag_id is not None: if preselected_tag_data is None: preselected_tag_data = self.api_workflow_client.tags_api.get_tag_by_tag_id( self.api_workflow_client.dataset_id, tag_id=self.preselected_tag_id) - chosen_samples_ids = BitMask.from_hex(preselected_tag_data.bit_mask_data).to_indices() - self.labeled_set = [self.api_workflow_client.filenames_on_server[i] for i in chosen_samples_ids] + new_bitmask_labeled_set = BitMask.from_hex(preselected_tag_data.bit_mask_data) + self.bitmask_added_set = new_bitmask_labeled_set 
- self.bitmask_labeled_set + self.bitmask_labeled_set = new_bitmask_labeled_set - if not hasattr(self, "unlabeled_set"): - if self.query_tag_id is None: - self.unlabeled_set = self.api_workflow_client.filenames_on_server - else: - query_tag_data = self.api_workflow_client.tags_api.get_tag_by_tag_id( - self.api_workflow_client.dataset_id, tag_id=self.query_tag_id) - chosen_samples_ids = BitMask.from_hex(query_tag_data.bit_mask_data).to_indices() - self.unlabeled_set = [self.api_workflow_client.filenames_on_server[i] for i in chosen_samples_ids] + if self.query_tag_id is None: + bitmask_query_tag = BitMask.from_length(len(self.api_workflow_client.filenames_on_server)) + else: + query_tag_data = self.api_workflow_client.tags_api.get_tag_by_tag_id( + self.api_workflow_client.dataset_id, tag_id=self.query_tag_id) + bitmask_query_tag = BitMask.from_hex(query_tag_data.bit_mask_data) + self.bitmask_unlabeled_set = bitmask_query_tag - self.bitmask_labeled_set - filenames_labeled = set(self.labeled_set) - self.unlabeled_set = [f for f in self.unlabeled_set if f not in filenames_labeled] + self.labeled_set = self.bitmask_labeled_set.subset_a_list(self.api_workflow_client.filenames_on_server) + self.added_set = self.bitmask_added_set.subset_a_list(self.api_workflow_client.filenames_on_server) + self.unlabeled_set = self.bitmask_unlabeled_set.subset_a_list(self.api_workflow_client.filenames_on_server) - def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> List[str]: + def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> Tuple[List[str], List[str]]: """Performs an active learning query. As part of it, the self.labeled_set and self.unlabeled_set are updated @@ -110,7 +111,7 @@ def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> List An instance of a class inheriting from Scorer, e.g. a ClassificationScorer. Returns: - The filenames of the samples in the new labeled_set. + The filenames of the samples in the new labeled_set and the added filenames. 
""" # check input @@ -119,29 +120,29 @@ def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> List "including the current labeled set " "(sampler_config.n_samples) " "is smaller than the number of samples in the current labeled set.") - return self.labeled_set - - # calculate scores - if al_scorer is not None: - no_unlabeled_samples = len(self.unlabeled_set) - no_samples_with_predictions = len(al_scorer.model_output) - if no_unlabeled_samples != no_samples_with_predictions: - raise ValueError(f"The scorer must have exactly as much samples as in the unlabeled set," - f"but there are {no_samples_with_predictions} predictions in the scorer," - f"but {no_unlabeled_samples} in the unlabeled set.") - scores_dict = al_scorer._calculate_scores() + else: - scores_dict = None + # calculate scores + if al_scorer is not None: + no_unlabeled_samples = len(self.unlabeled_set) + no_samples_with_predictions = len(al_scorer.model_output) + if no_unlabeled_samples != no_samples_with_predictions: + raise ValueError(f"The scorer must have exactly as much samples as in the unlabeled set," + f"but there are {no_samples_with_predictions} predictions in the scorer," + f"but {no_unlabeled_samples} in the unlabeled set.") + scores_dict = al_scorer._calculate_scores() + else: + scores_dict = None - # perform the sampling - new_tag_data = self.api_workflow_client.sampling( - sampler_config=sampler_config, - al_scores=scores_dict, - preselected_tag_id=self.preselected_tag_id, - query_tag_id=self.query_tag_id) + # perform the sampling + new_tag_data = self.api_workflow_client.sampling( + sampler_config=sampler_config, + al_scores=scores_dict, + preselected_tag_id=self.preselected_tag_id, + query_tag_id=self.query_tag_id) - # set the newly chosen tag as the new preselected_tag_id and update the sets - self.preselected_tag_id = new_tag_data.id - self._set_labeled_and_unlabeled_set(new_tag_data) + # set the newly chosen tag as the new preselected_tag_id and update the sets + self.preselected_tag_id = new_tag_data.id + self._set_labeled_and_unlabeled_set(new_tag_data) - return self.labeled_set + return self.labeled_set, self.added_set diff --git a/tests/active_learning/test_active_learning_agent.py b/tests/active_learning/test_active_learning_agent.py index a14a399a3..8212d6178 100644 --- a/tests/active_learning/test_active_learning_agent.py +++ b/tests/active_learning/test_active_learning_agent.py @@ -30,7 +30,10 @@ def test_agent(self): predictions = np.random.rand(len(agent.unlabeled_set), 10).astype(np.float32) predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis] al_scorer = ScorerClassification(predictions_normalized) - chosen_filenames = agent.query(sampler_config=sampler_config, al_scorer=al_scorer) + labeled_set, added_set = agent.query(sampler_config=sampler_config, al_scorer=al_scorer) else: sampler_config = SamplerConfig(n_samples=n_samples) - chosen_filenames = agent.query(sampler_config=sampler_config) + labeled_set, added_set = agent.query(sampler_config=sampler_config) + + assert len(added_set) <= len(labeled_set) + assert set(added_set).issubset(labeled_set) From ba192987699e6507833b2e88c8d940925193ea4b Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 17:28:31 +0200 Subject: [PATCH 06/27] added generated tag arithmetics endpoints --- .../swagger_client/__init__.py | 3 + .../swagger_client/api/tags_api.py | 107 ++++++++ .../swagger_client/models/__init__.py | 3 + .../models/tag_arithmetics_operation.py | 102 ++++++++ 
.../models/tag_arithmetics_request.py | 230 ++++++++++++++++++ .../models/tag_bit_mask_response.py | 124 ++++++++++ 6 files changed, 569 insertions(+) create mode 100644 lightly/openapi_generated/swagger_client/models/tag_arithmetics_operation.py create mode 100644 lightly/openapi_generated/swagger_client/models/tag_arithmetics_request.py create mode 100644 lightly/openapi_generated/swagger_client/models/tag_bit_mask_response.py diff --git a/lightly/openapi_generated/swagger_client/__init__.py b/lightly/openapi_generated/swagger_client/__init__.py index b810383d4..1f3b68217 100644 --- a/lightly/openapi_generated/swagger_client/__init__.py +++ b/lightly/openapi_generated/swagger_client/__init__.py @@ -73,7 +73,10 @@ from lightly.openapi_generated.swagger_client.models.sampling_create_request import SamplingCreateRequest from lightly.openapi_generated.swagger_client.models.sampling_method import SamplingMethod from lightly.openapi_generated.swagger_client.models.tag_active_learning_scores_data import TagActiveLearningScoresData +from lightly.openapi_generated.swagger_client.models.tag_arithmetics_operation import TagArithmeticsOperation +from lightly.openapi_generated.swagger_client.models.tag_arithmetics_request import TagArithmeticsRequest from lightly.openapi_generated.swagger_client.models.tag_bit_mask_data import TagBitMaskData +from lightly.openapi_generated.swagger_client.models.tag_bit_mask_response import TagBitMaskResponse from lightly.openapi_generated.swagger_client.models.tag_change_data import TagChangeData from lightly.openapi_generated.swagger_client.models.tag_create_request import TagCreateRequest from lightly.openapi_generated.swagger_client.models.tag_creator import TagCreator diff --git a/lightly/openapi_generated/swagger_client/api/tags_api.py b/lightly/openapi_generated/swagger_client/api/tags_api.py index fa9e859a2..fca7de663 100644 --- a/lightly/openapi_generated/swagger_client/api/tags_api.py +++ b/lightly/openapi_generated/swagger_client/api/tags_api.py @@ -547,3 +547,110 @@ def get_tags_by_dataset_id_with_http_info(self, dataset_id, **kwargs): # noqa: _preload_content=params.get('_preload_content', True), _request_timeout=params.get('_request_timeout'), collection_formats=collection_formats) + + def perform_tag_arithmetics(self, body, dataset_id, **kwargs): # noqa: E501 + """perform_tag_arithmetics # noqa: E501 + + performs tag arithmetics to compute a new bitmask out of two existing and optionally create a tag for it # noqa: E501 + This method makes a synchronous HTTP request by default. To make an + asynchronous HTTP request, please pass async_req=True + >>> thread = api.perform_tag_arithmetics(body, dataset_id, async_req=True) + >>> result = thread.get() + + :param async_req bool + :param TagArithmeticsRequest body: (required) + :param MongoObjectID dataset_id: ObjectId of the dataset (required) + :return: TagBitMaskResponse + If the method is called asynchronously, + returns the request thread. 
+ """ + kwargs['_return_http_data_only'] = True + if kwargs.get('async_req'): + return self.perform_tag_arithmetics_with_http_info(body, dataset_id, **kwargs) # noqa: E501 + else: + (data) = self.perform_tag_arithmetics_with_http_info(body, dataset_id, **kwargs) # noqa: E501 + return data + + def perform_tag_arithmetics_with_http_info(self, body, dataset_id, **kwargs): # noqa: E501 + """perform_tag_arithmetics # noqa: E501 + + performs tag arithmetics to compute a new bitmask out of two existing and optionally create a tag for it # noqa: E501 + This method makes a synchronous HTTP request by default. To make an + asynchronous HTTP request, please pass async_req=True + >>> thread = api.perform_tag_arithmetics_with_http_info(body, dataset_id, async_req=True) + >>> result = thread.get() + + :param async_req bool + :param TagArithmeticsRequest body: (required) + :param MongoObjectID dataset_id: ObjectId of the dataset (required) + :return: TagBitMaskResponse + If the method is called asynchronously, + returns the request thread. + """ + + all_params = ['body', 'dataset_id'] # noqa: E501 + all_params.append('async_req') + all_params.append('_return_http_data_only') + all_params.append('_preload_content') + all_params.append('_request_timeout') + + params = locals() + for key, val in six.iteritems(params['kwargs']): + if key not in all_params: + raise TypeError( + "Got an unexpected keyword argument '%s'" + " to method perform_tag_arithmetics" % key + ) + params[key] = val + del params['kwargs'] + # verify the required parameter 'body' is set + if self.api_client.client_side_validation and ('body' not in params or + params['body'] is None): # noqa: E501 + raise ValueError("Missing the required parameter `body` when calling `perform_tag_arithmetics`") # noqa: E501 + # verify the required parameter 'dataset_id' is set + if self.api_client.client_side_validation and ('dataset_id' not in params or + params['dataset_id'] is None): # noqa: E501 + raise ValueError("Missing the required parameter `dataset_id` when calling `perform_tag_arithmetics`") # noqa: E501 + + collection_formats = {} + + path_params = {} + if 'dataset_id' in params: + path_params['datasetId'] = params['dataset_id'] # noqa: E501 + + query_params = [] + + header_params = {} + + form_params = [] + local_var_files = {} + + body_params = None + if 'body' in params: + body_params = params['body'] + # HTTP header `Accept` + header_params['Accept'] = self.api_client.select_header_accept( + ['application/json']) # noqa: E501 + + # HTTP header `Content-Type` + header_params['Content-Type'] = self.api_client.select_header_content_type( # noqa: E501 + ['application/json']) # noqa: E501 + + # Authentication setting + auth_settings = ['ApiKeyAuth', 'auth0Bearer'] # noqa: E501 + + return self.api_client.call_api( + '/v1/datasets/{datasetId}/tags/arithmetics', 'POST', + path_params, + query_params, + header_params, + body=body_params, + post_params=form_params, + files=local_var_files, + response_type='TagBitMaskResponse', # noqa: E501 + auth_settings=auth_settings, + async_req=params.get('async_req'), + _return_http_data_only=params.get('_return_http_data_only'), + _preload_content=params.get('_preload_content', True), + _request_timeout=params.get('_request_timeout'), + collection_formats=collection_formats) diff --git a/lightly/openapi_generated/swagger_client/models/__init__.py b/lightly/openapi_generated/swagger_client/models/__init__.py index ee9d8b864..5bc544857 100644 --- a/lightly/openapi_generated/swagger_client/models/__init__.py +++ 
b/lightly/openapi_generated/swagger_client/models/__init__.py @@ -56,7 +56,10 @@ from lightly.openapi_generated.swagger_client.models.sampling_create_request import SamplingCreateRequest from lightly.openapi_generated.swagger_client.models.sampling_method import SamplingMethod from lightly.openapi_generated.swagger_client.models.tag_active_learning_scores_data import TagActiveLearningScoresData +from lightly.openapi_generated.swagger_client.models.tag_arithmetics_operation import TagArithmeticsOperation +from lightly.openapi_generated.swagger_client.models.tag_arithmetics_request import TagArithmeticsRequest from lightly.openapi_generated.swagger_client.models.tag_bit_mask_data import TagBitMaskData +from lightly.openapi_generated.swagger_client.models.tag_bit_mask_response import TagBitMaskResponse from lightly.openapi_generated.swagger_client.models.tag_change_data import TagChangeData from lightly.openapi_generated.swagger_client.models.tag_create_request import TagCreateRequest from lightly.openapi_generated.swagger_client.models.tag_creator import TagCreator diff --git a/lightly/openapi_generated/swagger_client/models/tag_arithmetics_operation.py b/lightly/openapi_generated/swagger_client/models/tag_arithmetics_operation.py new file mode 100644 index 000000000..99bddd916 --- /dev/null +++ b/lightly/openapi_generated/swagger_client/models/tag_arithmetics_operation.py @@ -0,0 +1,102 @@ +# coding: utf-8 + +""" + Lightly API + + Lightly.ai enables you to do self-supervised learning in an easy and intuitive way. The lightly.ai OpenAPI spec defines how one can interact with our REST API to unleash the full potential of lightly.ai # noqa: E501 + + OpenAPI spec version: 1.0.0 + Contact: support@lightly.ai + Generated by: https://github.com/swagger-api/swagger-codegen.git +""" + + +import pprint +import re # noqa: F401 + +import six + +from lightly.openapi_generated.swagger_client.configuration import Configuration + + +class TagArithmeticsOperation(object): + """NOTE: This class is auto generated by the swagger code generator program. + + Do not edit the class manually. + """ + + """ + allowed enum values + """ + UNION = "UNION" + INTERSECTION = "INTERSECTION" + DIFFERENCE = "DIFFERENCE" + + """ + Attributes: + swagger_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. 
+ """ + swagger_types = { + } + + attribute_map = { + } + + def __init__(self, _configuration=None): # noqa: E501 + """TagArithmeticsOperation - a model defined in Swagger""" # noqa: E501 + if _configuration is None: + _configuration = Configuration() + self._configuration = _configuration + self.discriminator = None + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.swagger_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + if issubclass(TagArithmeticsOperation, dict): + for key, value in self.items(): + result[key] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TagArithmeticsOperation): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TagArithmeticsOperation): + return True + + return self.to_dict() != other.to_dict() diff --git a/lightly/openapi_generated/swagger_client/models/tag_arithmetics_request.py b/lightly/openapi_generated/swagger_client/models/tag_arithmetics_request.py new file mode 100644 index 000000000..7ef31eb04 --- /dev/null +++ b/lightly/openapi_generated/swagger_client/models/tag_arithmetics_request.py @@ -0,0 +1,230 @@ +# coding: utf-8 + +""" + Lightly API + + Lightly.ai enables you to do self-supervised learning in an easy and intuitive way. The lightly.ai OpenAPI spec defines how one can interact with our REST API to unleash the full potential of lightly.ai # noqa: E501 + + OpenAPI spec version: 1.0.0 + Contact: support@lightly.ai + Generated by: https://github.com/swagger-api/swagger-codegen.git +""" + + +import pprint +import re # noqa: F401 + +import six + +from lightly.openapi_generated.swagger_client.configuration import Configuration + + +class TagArithmeticsRequest(object): + """NOTE: This class is auto generated by the swagger code generator program. + + Do not edit the class manually. + """ + + """ + Attributes: + swagger_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. 
+ """ + swagger_types = { + 'tag_id1': 'MongoObjectID', + 'tag_id2': 'MongoObjectID', + 'operation': 'TagArithmeticsOperation', + 'new_tag_name': 'TagName', + 'creator': 'TagCreator' + } + + attribute_map = { + 'tag_id1': 'tagId1', + 'tag_id2': 'tagId2', + 'operation': 'operation', + 'new_tag_name': 'newTagName', + 'creator': 'creator' + } + + def __init__(self, tag_id1=None, tag_id2=None, operation=None, new_tag_name=None, creator=None, _configuration=None): # noqa: E501 + """TagArithmeticsRequest - a model defined in Swagger""" # noqa: E501 + if _configuration is None: + _configuration = Configuration() + self._configuration = _configuration + + self._tag_id1 = None + self._tag_id2 = None + self._operation = None + self._new_tag_name = None + self._creator = None + self.discriminator = None + + self.tag_id1 = tag_id1 + self.tag_id2 = tag_id2 + self.operation = operation + if new_tag_name is not None: + self.new_tag_name = new_tag_name + if creator is not None: + self.creator = creator + + @property + def tag_id1(self): + """Gets the tag_id1 of this TagArithmeticsRequest. # noqa: E501 + + + :return: The tag_id1 of this TagArithmeticsRequest. # noqa: E501 + :rtype: MongoObjectID + """ + return self._tag_id1 + + @tag_id1.setter + def tag_id1(self, tag_id1): + """Sets the tag_id1 of this TagArithmeticsRequest. + + + :param tag_id1: The tag_id1 of this TagArithmeticsRequest. # noqa: E501 + :type: MongoObjectID + """ + if self._configuration.client_side_validation and tag_id1 is None: + raise ValueError("Invalid value for `tag_id1`, must not be `None`") # noqa: E501 + + self._tag_id1 = tag_id1 + + @property + def tag_id2(self): + """Gets the tag_id2 of this TagArithmeticsRequest. # noqa: E501 + + + :return: The tag_id2 of this TagArithmeticsRequest. # noqa: E501 + :rtype: MongoObjectID + """ + return self._tag_id2 + + @tag_id2.setter + def tag_id2(self, tag_id2): + """Sets the tag_id2 of this TagArithmeticsRequest. + + + :param tag_id2: The tag_id2 of this TagArithmeticsRequest. # noqa: E501 + :type: MongoObjectID + """ + if self._configuration.client_side_validation and tag_id2 is None: + raise ValueError("Invalid value for `tag_id2`, must not be `None`") # noqa: E501 + + self._tag_id2 = tag_id2 + + @property + def operation(self): + """Gets the operation of this TagArithmeticsRequest. # noqa: E501 + + + :return: The operation of this TagArithmeticsRequest. # noqa: E501 + :rtype: TagArithmeticsOperation + """ + return self._operation + + @operation.setter + def operation(self, operation): + """Sets the operation of this TagArithmeticsRequest. + + + :param operation: The operation of this TagArithmeticsRequest. # noqa: E501 + :type: TagArithmeticsOperation + """ + if self._configuration.client_side_validation and operation is None: + raise ValueError("Invalid value for `operation`, must not be `None`") # noqa: E501 + + self._operation = operation + + @property + def new_tag_name(self): + """Gets the new_tag_name of this TagArithmeticsRequest. # noqa: E501 + + + :return: The new_tag_name of this TagArithmeticsRequest. # noqa: E501 + :rtype: TagName + """ + return self._new_tag_name + + @new_tag_name.setter + def new_tag_name(self, new_tag_name): + """Sets the new_tag_name of this TagArithmeticsRequest. + + + :param new_tag_name: The new_tag_name of this TagArithmeticsRequest. # noqa: E501 + :type: TagName + """ + + self._new_tag_name = new_tag_name + + @property + def creator(self): + """Gets the creator of this TagArithmeticsRequest. 
# noqa: E501 + + + :return: The creator of this TagArithmeticsRequest. # noqa: E501 + :rtype: TagCreator + """ + return self._creator + + @creator.setter + def creator(self, creator): + """Sets the creator of this TagArithmeticsRequest. + + + :param creator: The creator of this TagArithmeticsRequest. # noqa: E501 + :type: TagCreator + """ + + self._creator = creator + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.swagger_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + if issubclass(TagArithmeticsRequest, dict): + for key, value in self.items(): + result[key] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TagArithmeticsRequest): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TagArithmeticsRequest): + return True + + return self.to_dict() != other.to_dict() diff --git a/lightly/openapi_generated/swagger_client/models/tag_bit_mask_response.py b/lightly/openapi_generated/swagger_client/models/tag_bit_mask_response.py new file mode 100644 index 000000000..ce40bfb40 --- /dev/null +++ b/lightly/openapi_generated/swagger_client/models/tag_bit_mask_response.py @@ -0,0 +1,124 @@ +# coding: utf-8 + +""" + Lightly API + + Lightly.ai enables you to do self-supervised learning in an easy and intuitive way. The lightly.ai OpenAPI spec defines how one can interact with our REST API to unleash the full potential of lightly.ai # noqa: E501 + + OpenAPI spec version: 1.0.0 + Contact: support@lightly.ai + Generated by: https://github.com/swagger-api/swagger-codegen.git +""" + + +import pprint +import re # noqa: F401 + +import six + +from lightly.openapi_generated.swagger_client.configuration import Configuration + + +class TagBitMaskResponse(object): + """NOTE: This class is auto generated by the swagger code generator program. + + Do not edit the class manually. + """ + + """ + Attributes: + swagger_types (dict): The key is attribute name + and the value is attribute type. + attribute_map (dict): The key is attribute name + and the value is json key in definition. + """ + swagger_types = { + 'bit_mask_data': 'TagBitMaskData' + } + + attribute_map = { + 'bit_mask_data': 'bitMaskData' + } + + def __init__(self, bit_mask_data=None, _configuration=None): # noqa: E501 + """TagBitMaskResponse - a model defined in Swagger""" # noqa: E501 + if _configuration is None: + _configuration = Configuration() + self._configuration = _configuration + + self._bit_mask_data = None + self.discriminator = None + + self.bit_mask_data = bit_mask_data + + @property + def bit_mask_data(self): + """Gets the bit_mask_data of this TagBitMaskResponse. # noqa: E501 + + + :return: The bit_mask_data of this TagBitMaskResponse. 
# noqa: E501 + :rtype: TagBitMaskData + """ + return self._bit_mask_data + + @bit_mask_data.setter + def bit_mask_data(self, bit_mask_data): + """Sets the bit_mask_data of this TagBitMaskResponse. + + + :param bit_mask_data: The bit_mask_data of this TagBitMaskResponse. # noqa: E501 + :type: TagBitMaskData + """ + if self._configuration.client_side_validation and bit_mask_data is None: + raise ValueError("Invalid value for `bit_mask_data`, must not be `None`") # noqa: E501 + + self._bit_mask_data = bit_mask_data + + def to_dict(self): + """Returns the model properties as a dict""" + result = {} + + for attr, _ in six.iteritems(self.swagger_types): + value = getattr(self, attr) + if isinstance(value, list): + result[attr] = list(map( + lambda x: x.to_dict() if hasattr(x, "to_dict") else x, + value + )) + elif hasattr(value, "to_dict"): + result[attr] = value.to_dict() + elif isinstance(value, dict): + result[attr] = dict(map( + lambda item: (item[0], item[1].to_dict()) + if hasattr(item[1], "to_dict") else item, + value.items() + )) + else: + result[attr] = value + if issubclass(TagBitMaskResponse, dict): + for key, value in self.items(): + result[key] = value + + return result + + def to_str(self): + """Returns the string representation of the model""" + return pprint.pformat(self.to_dict()) + + def __repr__(self): + """For `print` and `pprint`""" + return self.to_str() + + def __eq__(self, other): + """Returns true if both objects are equal""" + if not isinstance(other, TagBitMaskResponse): + return False + + return self.to_dict() == other.to_dict() + + def __ne__(self, other): + """Returns true if both objects are not equal""" + if not isinstance(other, TagBitMaskResponse): + return True + + return self.to_dict() != other.to_dict() From 8c4546760a964171cecc25a2709d3d284060af6f Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 19:46:47 +0200 Subject: [PATCH 07/27] created tests for download_cli --- lightly/cli/__init__.py | 2 +- lightly/cli/config/config.yaml | 4 ++ .../{download_cli.py => download_cli_file.py} | 0 .../mocked_api_workflow_client.py | 18 +++-- tests/cli/test_cli_download.py | 67 +++++++++++++++++++ 5 files changed, 84 insertions(+), 7 deletions(-) rename lightly/cli/{download_cli.py => download_cli_file.py} (100%) create mode 100644 tests/cli/test_cli_download.py diff --git a/lightly/cli/__init__.py b/lightly/cli/__init__.py index e64a86144..f8ff2886b 100644 --- a/lightly/cli/__init__.py +++ b/lightly/cli/__init__.py @@ -10,4 +10,4 @@ from lightly.cli.train_cli import train_cli from lightly.cli.embed_cli import embed_cli from lightly.cli.upload_cli import upload_cli -from lightly.cli.download_cli import download_cli +from lightly.cli.download_cli_file import download_cli diff --git a/lightly/cli/config/config.yaml b/lightly/cli/config/config.yaml index 7524b406f..be8a525a2 100644 --- a/lightly/cli/config/config.yaml +++ b/lightly/cli/config/config.yaml @@ -18,6 +18,7 @@ resize: -1 # Allow resizing of the images before uploading, u embedding_name: 'default' # Name of the embedding to be used on the platform. emb_upload_bsz: 32 # Number of embeddings which are uploaded in a single batch. tag_name: 'initial-tag' # Name of the requested tag on the Lightly platform. 
+exclude_parent_tag: False # If true, only the samples in the defined tag, but without the parent tag, are taken ### training and embeddings pre_trained: True # Whether to use a pre-trained model or not @@ -129,6 +130,9 @@ hydra: Download a list of files in a given tag from the Lightly web solution > lightly-download tag_name='my-tag' dataset_id='your_dataset_id' token='your_access_token' + Download a list of files in a given tag without filenames from the parent tag from the Lightly web solution + > lightly-download tag_name='my-tag' dataset_id='your_dataset_id' token='your_access_token' exclude_parent_tag=True + Copy all files in a given tag from a source directory to a target directory > lightly-download tag_name='my-tag' dataset_id='your_dataset_id' token='your_access_token' input_dir='data/' output_dir='new_data/' diff --git a/lightly/cli/download_cli.py b/lightly/cli/download_cli_file.py similarity index 100% rename from lightly/cli/download_cli.py rename to lightly/cli/download_cli_file.py diff --git a/tests/api_workflow/mocked_api_workflow_client.py b/tests/api_workflow/mocked_api_workflow_client.py index 17659615d..0cdace1ae 100644 --- a/tests/api_workflow/mocked_api_workflow_client.py +++ b/tests/api_workflow/mocked_api_workflow_client.py @@ -16,7 +16,7 @@ from typing import * from lightly.openapi_generated.swagger_client import ScoresApi, CreateEntityResponse, SamplesApi, SampleCreateRequest, \ - InitialTagCreateRequest, ApiClient, VersioningApi, QuotaApi + InitialTagCreateRequest, ApiClient, VersioningApi, QuotaApi, TagArithmeticsRequest, TagBitMaskResponse from lightly.openapi_generated.swagger_client.api.embeddings_api import EmbeddingsApi from lightly.openapi_generated.swagger_client.api.jobs_api import JobsApi from lightly.openapi_generated.swagger_client.api.mappings_api import MappingsApi @@ -93,19 +93,25 @@ def get_tag_by_tag_id(self, dataset_id, tag_id, **kwargs): def get_tags_by_dataset_id(self, dataset_id, **kwargs): tag_1 = TagData(id='inital_tag_id', dataset_id=dataset_id, prev_tag_id=None, - bit_mask_data="0x80bda23e9", name='initial-tag', tot_size=15, + bit_mask_data="0xF", name='initial-tag', tot_size=4, created_at=1577836800, changes=dict()) tag_2 = TagData(id='query_tag_id_xyz', dataset_id=dataset_id, prev_tag_id="initial-tag", - bit_mask_data="0x80bda23e9", name='query_tag_name_xyz', tot_size=15, + bit_mask_data="0xF", name='query_tag_name_xyz', tot_size=4, created_at=1577836800, changes=dict()) tag_3 = TagData(id='preselected_tag_id_xyz', dataset_id=dataset_id, prev_tag_id="initial-tag", - bit_mask_data="0x80bda23e9", name='preselected_tag_name_xyz', tot_size=15, + bit_mask_data="0x1", name='preselected_tag_name_xyz', tot_size=4, created_at=1577836800, changes=dict()) - tags = [tag_1, tag_2, tag_3] - no_tags_to_return = getattr(self, "no_tags", 3) + tag_4 = TagData(id='sampled_tag_xyz', dataset_id=dataset_id, prev_tag_id="preselected_tag_id_xyz", + bit_mask_data="0x3", name='sampled_tag_xyz', tot_size=4, + created_at=1577836800, changes=dict()) + tags = [tag_1, tag_2, tag_3, tag_4] + no_tags_to_return = getattr(self, "no_tags", 4) tags = tags[:no_tags_to_return] return tags + def perform_tag_arithmetics(self, body: TagArithmeticsRequest, dataset_id, **kwargs): + return TagBitMaskResponse(bit_mask_data="0x2") + class MockedScoresApi(ScoresApi): def create_or_update_active_learning_score_by_tag_id(self, body, dataset_id, tag_id, **kwargs) -> \ diff --git a/tests/cli/test_cli_download.py b/tests/cli/test_cli_download.py new file mode 100644 index 
000000000..a9e3ccd2d --- /dev/null +++ b/tests/cli/test_cli_download.py @@ -0,0 +1,67 @@ +import os +import re +import tempfile + +from hydra.experimental import compose, initialize + +import lightly +from tests.api_workflow.mocked_api_workflow_client import MockedApiWorkflowSetup, MockedApiWorkflowClient + + +#in download_cli_file.py: from lightly.api.api_workflow_client import ApiWorkflowClient + +class TestCLIDownload(MockedApiWorkflowSetup): + + @classmethod + def setUpClass(cls) -> None: + lightly.cli.download_cli_file.ApiWorkflowClient = MockedApiWorkflowClient + initialize(config_path="../../lightly/cli/config", job_name="test_app") + + def setUp(self): + self.cfg = compose(config_name="config", overrides=["token='123'", "dataset_id='XYZ'"]) + + + def parse_cli_string(self, cli_words: str): + cli_words = cli_words.replace("lightly-download ", "") + cli_words = re.split("=| ", cli_words) + assert len(cli_words) % 2 == 0 + dict_keys = cli_words[0::2] + dict_values = cli_words[1::2] + for key, value in zip(dict_keys, dict_values): + value = value.strip('\"') + value = value.strip('\'') + self.cfg[key] = value + + def test_parse_cli_string(self): + cli_string = "lightly-download token='123' dataset_id='XYZ'" + self.parse_cli_string(cli_string) + assert self.cfg["token"] == '123' + assert self.cfg["dataset_id"] == 'XYZ' + + def test_download_base(self): + cli_string = "lightly-download token='123' dataset_id='XYZ'" + self.parse_cli_string(cli_string) + lightly.cli.download_cli_file.download_cli(self.cfg) + + def test_download_tag_name(self): + cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='sampled_tag_xyz'" + self.parse_cli_string(cli_string) + lightly.cli.download_cli_file.download_cli(self.cfg) + + def test_download_tag_name_nonexisting(self): + cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='nonexisting_xyz'" + self.parse_cli_string(cli_string) + lightly.cli.download_cli_file.download_cli(self.cfg) + + def test_download_tag_name_exclude_parent(self): + cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='sampled_tag_xyz' exclude_parent_tag=True" + self.parse_cli_string(cli_string) + lightly.cli.download_cli_file.download_cli(self.cfg) + + def tearDown(self) -> None: + try: + os.remove(f"{self.cfg['tag_name']}.txt") + except FileNotFoundError: + pass + + From 3518293f270cf4183c0c4c24422ff72312ddd08e Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Fri, 16 Apr 2021 19:53:37 +0200 Subject: [PATCH 08/27] bugfix: "lightly-download = lightly.cli.download_cli_file:entry" --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b9af51836..f0d9527f8 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ def load_requirements(path_dir=PATH_ROOT, filename='base.txt', comment_char='#') "lightly-embed = lightly.cli.embed_cli:entry", "lightly-magic = lightly.cli.lightly_cli:entry", "lightly-upload = lightly.cli.upload_cli:entry", - "lightly-download = lightly.cli.download_cli:entry", + "lightly-download = lightly.cli.download_cli_file:entry", "lightly-version = lightly.cli.version_cli:entry", ] } From 43dee8cc7b251794de5af09644a71c3f18421e9a Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 00:49:17 +0200 Subject: [PATCH 09/27] Bugfix: actual change got lost in renaming --- lightly/cli/download_cli_file.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/lightly/cli/download_cli_file.py b/lightly/cli/download_cli_file.py index 
feacb2a37..57ce8367d 100644 --- a/lightly/cli/download_cli_file.py +++ b/lightly/cli/download_cli_file.py @@ -20,10 +20,11 @@ from lightly.api.utils import getenv from lightly.api.api_workflow_client import ApiWorkflowClient from lightly.api.bitmask import BitMask +from lightly.openapi_generated.swagger_client import TagData, TagArithmeticsRequest, TagArithmeticsOperation, \ + TagBitMaskResponse def _download_cli(cfg, is_cli_call=True): - tag_name = cfg['tag_name'] dataset_id = cfg['dataset_id'] token = cfg['token'] @@ -50,12 +51,24 @@ def _download_cli(cfg, is_cli_call=True): return # get tag data - tag_data = api_workflow_client.tags_api.get_tag_by_tag_id( + tag_data: TagData = api_workflow_client.tags_api.get_tag_by_tag_id( dataset_id=dataset_id, tag_id=tag_id ) - + + if cfg["exclude_parent_tag"]: + parent_tag_id = tag_data.prev_tag_id + tag_arithmetics_request = TagArithmeticsRequest( + tag_id1=tag_data.id, + tag_id2=parent_tag_id, + operation=TagArithmeticsOperation.DIFFERENCE) + bit_mask_response: TagBitMaskResponse \ + = api_workflow_client.tags_api.perform_tag_arithmetics(body=tag_arithmetics_request, dataset_id=dataset_id) + bit_mask_data = bit_mask_response.bit_mask_data + else: + bit_mask_data = tag_data.bit_mask_data + # get samples - chosen_samples_ids = BitMask.from_hex(tag_data.bit_mask_data).to_indices() + chosen_samples_ids = BitMask.from_hex(bit_mask_data).to_indices() samples = [api_workflow_client.filenames_on_server[i] for i in chosen_samples_ids] # store sample names in a .txt file From 5214e42f56286ff1749ce0dd17e2cd6a990bd7b0 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 08:40:32 +0200 Subject: [PATCH 10/27] upload_cli: use number of workers specified --- lightly/cli/upload_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightly/cli/upload_cli.py b/lightly/cli/upload_cli.py index a5c648d4d..d4ac0abb9 100644 --- a/lightly/cli/upload_cli.py +++ b/lightly/cli/upload_cli.py @@ -52,7 +52,7 @@ def _upload_cli(cfg, is_cli_call=True): mode = cfg['upload'] dataset = LightlyDataset(input_dir=input_dir, transform=transform) api_workflow_client.upload_dataset( - input=dataset, mode=mode + input=dataset, mode=mode, max_workers=cfg['loader']['num_workers'] ) if path_to_embeddings: From c39541f501547fa2f1708e3153488a0d0618a2f2 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 08:52:38 +0200 Subject: [PATCH 11/27] umocked test_api now uses upload_cli to test setting loader.num_workers --- tests/UNMOCKED_end2end_tests/test_api.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/UNMOCKED_end2end_tests/test_api.py b/tests/UNMOCKED_end2end_tests/test_api.py index 7f15b2a70..da919e31d 100644 --- a/tests/UNMOCKED_end2end_tests/test_api.py +++ b/tests/UNMOCKED_end2end_tests/test_api.py @@ -4,6 +4,9 @@ from typing import List, Tuple import numpy as np +from hydra.experimental import initialize, compose + +from lightly.cli import upload_cli from lightly.data.dataset import LightlyDataset from lightly.active_learning.scorers.classification import ScorerClassification @@ -69,7 +72,14 @@ def create_new_dataset_with_embeddings(path_to_dataset: str, api_workflow_client.create_new_dataset_with_unique_name(dataset_basename=dataset_name) # upload to the dataset - api_workflow_client.upload_dataset(input=path_to_dataset) + initialize(config_path="../../lightly/cli/config", job_name="test_app") + cfg = compose(config_name="config", overrides=[ + f"input_dir='{path_to_dataset}'", + f"token='{token}'", + 
f"dataset_id={api_workflow_client.dataset_id}", + f"loader.num_workers=9" + ]) + upload_cli(cfg) # calculate and save the embeddings path_to_embeddings_csv = f"{path_to_dataset}/embeddings.csv" From a9364cd8acd00ca023703258e65141c6730969b2 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 09:32:07 +0200 Subject: [PATCH 12/27] upload_dataset: handle case where max_workers=0 --- lightly/api/api_workflow_upload_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lightly/api/api_workflow_upload_dataset.py b/lightly/api/api_workflow_upload_dataset.py index a64220a64..4d1795fd1 100644 --- a/lightly/api/api_workflow_upload_dataset.py +++ b/lightly/api/api_workflow_upload_dataset.py @@ -61,6 +61,7 @@ def upload_dataset(self, input: Union[str, LightlyDataset], max_workers: int = 8 # handle the case where len(dataset) < max_workers max_workers = min(len(dataset), max_workers) + max_workers = max(max_workers, 1) # upload the samples if verbose: From e53f4ff5562144065342e3792ba68a3e58ad8310 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 11:54:09 +0200 Subject: [PATCH 13/27] much more tests and bugfixes for bitmask and agent --- lightly/active_learning/agents/agent.py | 72 ++++++++++--------- lightly/api/bitmask.py | 17 ++--- .../test_active_learning_agent.py | 27 ++++++- tests/api/test_BitMask.py | 47 +++++++++--- 4 files changed, 107 insertions(+), 56 deletions(-) diff --git a/lightly/active_learning/agents/agent.py b/lightly/active_learning/agents/agent.py index 2eab9020d..493817c61 100644 --- a/lightly/active_learning/agents/agent.py +++ b/lightly/active_learning/agents/agent.py @@ -49,7 +49,8 @@ class ActiveLearningAgent: """ - def __init__(self, api_workflow_client: ApiWorkflowClient, query_tag_name: str = None, preselected_tag_name: str = None): + def __init__(self, api_workflow_client: ApiWorkflowClient, query_tag_name: str = None, + preselected_tag_name: str = None): self.api_workflow_client = api_workflow_client if query_tag_name is not None or preselected_tag_name is not None: @@ -76,10 +77,12 @@ def _set_labeled_and_unlabeled_set(self, preselected_tag_data: TagData = None): optional param, then it must not be loaded from the API """ - self.bitmask_labeled_set = BitMask.from_hex("0x0") - self.bitmask_added_set = BitMask.from_hex("0x0") - if self.preselected_tag_id is not None: - if preselected_tag_data is None: + + if not hasattr(self, "bitmask_labeled_set"): + self.bitmask_labeled_set = BitMask.from_hex("0x0") # empty labeled set + self.bitmask_added_set = BitMask.from_hex("0x0") # empty added set + if self.preselected_tag_id is not None: # else the default values (empty labeled and added set) are kept + if preselected_tag_data is None: # if it is not passed as argument, it must be loaded from the API preselected_tag_data = self.api_workflow_client.tags_api.get_tag_by_tag_id( self.api_workflow_client.dataset_id, tag_id=self.preselected_tag_id) new_bitmask_labeled_set = BitMask.from_hex(preselected_tag_data.bit_mask_data) @@ -111,38 +114,43 @@ def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> Tupl An instance of a class inheriting from Scorer, e.g. a ClassificationScorer. Returns: - The filenames of the samples in the new labeled_set and the added filenames. + The filenames of the samples in the new labeled_set + and the filenames of the samples chosen by the sampler. + This added_set was added to the old labeled_set + to form the new labeled_set. 
""" # check input if sampler_config.n_samples < len(self.labeled_set): warnings.warn("ActiveLearningAgent.query: The number of samples which should be sampled " - "including the current labeled set " - "(sampler_config.n_samples) " - "is smaller than the number of samples in the current labeled set.") - + "including the current labeled set " + "(sampler_config.n_samples) " + "is smaller than the number of samples in the current labeled set." + "Skipping the sampling and returning the old labeled_set and" + "no ne filenames.") + return self.labeled_set, [] + + # calculate scores + if al_scorer is not None: + no_unlabeled_samples = len(self.unlabeled_set) + no_samples_with_predictions = len(al_scorer.model_output) + if no_unlabeled_samples != no_samples_with_predictions: + raise ValueError(f"The scorer must have exactly as many samples as in the unlabeled set," + f"but there are {no_samples_with_predictions} predictions in the scorer," + f"but {no_unlabeled_samples} in the unlabeled set.") + scores_dict = al_scorer._calculate_scores() else: - # calculate scores - if al_scorer is not None: - no_unlabeled_samples = len(self.unlabeled_set) - no_samples_with_predictions = len(al_scorer.model_output) - if no_unlabeled_samples != no_samples_with_predictions: - raise ValueError(f"The scorer must have exactly as much samples as in the unlabeled set," - f"but there are {no_samples_with_predictions} predictions in the scorer," - f"but {no_unlabeled_samples} in the unlabeled set.") - scores_dict = al_scorer._calculate_scores() - else: - scores_dict = None - - # perform the sampling - new_tag_data = self.api_workflow_client.sampling( - sampler_config=sampler_config, - al_scores=scores_dict, - preselected_tag_id=self.preselected_tag_id, - query_tag_id=self.query_tag_id) - - # set the newly chosen tag as the new preselected_tag_id and update the sets - self.preselected_tag_id = new_tag_data.id - self._set_labeled_and_unlabeled_set(new_tag_data) + scores_dict = None + + # perform the sampling + new_tag_data = self.api_workflow_client.sampling( + sampler_config=sampler_config, + al_scores=scores_dict, + preselected_tag_id=self.preselected_tag_id, + query_tag_id=self.query_tag_id) + + # set the newly chosen tag as the new preselected_tag_id and update the sets + self.preselected_tag_id = new_tag_data.id + self._set_labeled_and_unlabeled_set(new_tag_data) return self.labeled_set, self.added_set diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 78ca34a26..0c3b1da7d 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -2,7 +2,7 @@ # Copyright (c) 2020. Lightly AG and its affiliates. # All Rights Reserved - +from copy import deepcopy from typing import List @@ -68,12 +68,6 @@ def _intersection(x: int, y: int) -> int: return x & y -def _difference(x: int, y: int) -> int: - """Uses difference to get the intersection of the two masks. - """ - return x - y - - def _get_kth_bit(x: int, k: int) -> int: """Returns the kth bit in the mask from the right. """ @@ -108,7 +102,7 @@ class BitMask: >>> # for a dataset with 10 images, assume the following tag >>> # 0001011001 where the 1st, 4th, 5th and 7th image are selected >>> # this tag would be stored as 0x59. 
- >>> hexstring = 0x59 # what you receive from the api + >>> hexstring = '0x59' # what you receive from the api >>> mask = BitMask.from_hex(hexstring) # create a bitmask from it >>> indices = mask.to_indices() # get list of indices which are one >>> # indices is [0, 3, 4, 6] @@ -194,10 +188,13 @@ def difference(self, other): >>> mask1.difference(mask2) >>> # mask1.binstring is '0b0011' """ - self.x = _difference(self.x, other.x) + self.union(other) + self.x = self.x - other.x def __sub__(self, other): - return BitMask(self.x - other.x) + ret = deepcopy(self) + ret.difference(other) + return ret def __eq__(self, other): return self.to_bin() == other.to_bin() diff --git a/tests/active_learning/test_active_learning_agent.py b/tests/active_learning/test_active_learning_agent.py index 8212d6178..a84d2a53d 100644 --- a/tests/active_learning/test_active_learning_agent.py +++ b/tests/active_learning/test_active_learning_agent.py @@ -20,13 +20,16 @@ def test_agent(self): for method in [SamplingMethod.CORAL, SamplingMethod.CORESET, SamplingMethod.RANDOM]: for agent in [agent_0, agent_1, agent_2, agent_3]: for batch_size in [2, 6]: + n_old_labeled = len(agent.labeled_set) + n_old_unlabeled = len(agent.unlabeled_set) + n_samples = len(agent.labeled_set) + batch_size - if method == SamplingMethod.CORAL and len(agent.labeled_set) > 0: + if method == SamplingMethod.CORAL and len(agent.labeled_set) == 0: sampler_config = SamplerConfig(n_samples=n_samples, method=SamplingMethod.CORESET) else: sampler_config = SamplerConfig(n_samples=n_samples, method=method) - if sampler_config.method == SamplingMethod.CORESET: + if sampler_config.method == SamplingMethod.CORAL: predictions = np.random.rand(len(agent.unlabeled_set), 10).astype(np.float32) predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis] al_scorer = ScorerClassification(predictions_normalized) @@ -35,5 +38,23 @@ def test_agent(self): sampler_config = SamplerConfig(n_samples=n_samples) labeled_set, added_set = agent.query(sampler_config=sampler_config) - assert len(added_set) <= len(labeled_set) + self.assertEqual(n_old_labeled + len(added_set), len(labeled_set)) assert set(added_set).issubset(labeled_set) + self.assertEqual(len(list(set(agent.labeled_set) & set(agent.unlabeled_set))), 0) + self.assertEqual(n_old_unlabeled - len(added_set), len(agent.unlabeled_set)) + + def test_agent_wrong_scores(self): + self.api_workflow_client.embedding_id = "embedding_id_xyz" + + agent = ActiveLearningAgent(self.api_workflow_client, preselected_tag_name="preselected_tag_name_xyz") + method = SamplingMethod.CORAL + n_samples = len(agent.labeled_set) + 2 + + n_predictions = len(agent.unlabeled_set) - 3 # the -3 should cause en error + predictions = np.random.rand(n_predictions, 10).astype(np.float32) + predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis] + al_scorer = ScorerClassification(predictions_normalized) + + sampler_config = SamplerConfig(n_samples=n_samples, method=method) + with self.assertRaises(ValueError): + labeled_set, added_set = agent.query(sampler_config=sampler_config, al_scorer=al_scorer) diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index 0a5d0ba9a..8e362d719 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -1,4 +1,6 @@ import unittest +from random import random, seed, randint + from lightly.api.bitmask import BitMask N = 10 @@ -91,31 +93,54 @@ def test_intersection(self): mask_a.intersection(mask_b) self.assertEqual(mask_a.x, int("0b100", 2)) - 
def test_difference(self): - mask_a = BitMask.from_bin("0b101") - mask_b = BitMask.from_bin("0b001") + def assert_difference(self, bistring_1: str, bitstring_2: str, target: str): + mask_a = BitMask.from_bin(bistring_1) + mask_b = BitMask.from_bin(bitstring_2) mask_a.difference(mask_b) - self.assertEqual(mask_a.x, int("0b100", 2)) + self.assertEqual(mask_a.x, int(target, 2)) + + def test_differences(self): + self.assert_difference("0b101", "0b001", "0b100") + self.assert_difference("0b0111", "0b1100", "0b0011") + self.assert_difference("0b10111", "0b01100", "0b10011") + + def random_bitsting(self, length: int): + bitsting = '0b' + for i in range(length): + bitsting += str(randint(0, 1)) + return bitsting + + def test_difference_random(self): + seed(42) + for rep in range(10): + for string_length in range(1, 100, 10): + bitstring_1 = self.random_bitsting(string_length) + bitstring_2 = self.random_bitsting(string_length) + target = '0b' + for bit_1, bit_2 in zip(bitstring_1[2:], bitstring_2[2:]): + if bit_1 == '1' and bit_2 == '0': + target += '1' + else: + target += '0' + self.assert_difference(bitstring_1, bitstring_2, target) def test_operator_minus(self): - mask_a = BitMask.from_bin("0b101") - mask_b = BitMask.from_bin("0b001") - mask_target = BitMask.from_bin("0b100") - self.assertEqual(mask_a-mask_b, mask_target) + mask_a = BitMask.from_bin("0b10111") + mask_b = BitMask.from_bin("0b01100") + mask_target = BitMask.from_bin("0b10011") + self.assertEqual(mask_a - mask_b, mask_target) def test_equal(self): mask_a = BitMask.from_bin("0b101") mask_b = BitMask.from_bin("0b101") self.assertEqual(mask_a, mask_b) - def test_subset_a_list(self): list_ = [4, 7, 9, 1] mask = BitMask.from_bin("0b0101") target_masked_list = [7, 1] masked_list = mask.subset_a_list(list_) - self.assertEqual(target_masked_list,masked_list) - + self.assertEqual(target_masked_list, masked_list) def test_nonzero_bits(self): From 396d5319f4f7e4465d8017e97691a7928e1dc8d0 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 12:02:46 +0200 Subject: [PATCH 14/27] renamed bitmask.subset_a_list to masked_select_from_list --- lightly/active_learning/agents/agent.py | 6 +++--- lightly/api/bitmask.py | 4 ++-- tests/api/test_BitMask.py | 8 ++++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/lightly/active_learning/agents/agent.py b/lightly/active_learning/agents/agent.py index 493817c61..550522b07 100644 --- a/lightly/active_learning/agents/agent.py +++ b/lightly/active_learning/agents/agent.py @@ -97,9 +97,9 @@ def _set_labeled_and_unlabeled_set(self, preselected_tag_data: TagData = None): bitmask_query_tag = BitMask.from_hex(query_tag_data.bit_mask_data) self.bitmask_unlabeled_set = bitmask_query_tag - self.bitmask_labeled_set - self.labeled_set = self.bitmask_labeled_set.subset_a_list(self.api_workflow_client.filenames_on_server) - self.added_set = self.bitmask_added_set.subset_a_list(self.api_workflow_client.filenames_on_server) - self.unlabeled_set = self.bitmask_unlabeled_set.subset_a_list(self.api_workflow_client.filenames_on_server) + self.labeled_set = self.bitmask_labeled_set.masked_select_from_list(self.api_workflow_client.filenames_on_server) + self.added_set = self.bitmask_added_set.masked_select_from_list(self.api_workflow_client.filenames_on_server) + self.unlabeled_set = self.bitmask_unlabeled_set.masked_select_from_list(self.api_workflow_client.filenames_on_server) def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> Tuple[List[str], List[str]]: """Performs an 
active learning query. diff --git a/lightly/api/bitmask.py b/lightly/api/bitmask.py index 0c3b1da7d..d0cb0f011 100644 --- a/lightly/api/bitmask.py +++ b/lightly/api/bitmask.py @@ -199,12 +199,12 @@ def __sub__(self, other): def __eq__(self, other): return self.to_bin() == other.to_bin() - def subset_a_list(self, list_: List): + def masked_select_from_list(self, list_: List): """Returns a subset of a list depending on the bitmask Examples: >>> list_to_subset = [4, 7, 9, 1] >>> mask = BitMask.from_bin("0b0101") - >>> masked_list = mask.subset_a_list(list_to_subset) + >>> masked_list = mask.masked_select_from_list(list_to_subset) >>> # masked_list = [7, 1] """ bits = self.to_bin() diff --git a/tests/api/test_BitMask.py b/tests/api/test_BitMask.py index 8e362d719..a0eab9851 100644 --- a/tests/api/test_BitMask.py +++ b/tests/api/test_BitMask.py @@ -1,4 +1,5 @@ import unittest +from copy import deepcopy from random import random, seed, randint from lightly.api.bitmask import BitMask @@ -126,9 +127,12 @@ def test_difference_random(self): def test_operator_minus(self): mask_a = BitMask.from_bin("0b10111") + mask_a_old = deepcopy(mask_a) mask_b = BitMask.from_bin("0b01100") mask_target = BitMask.from_bin("0b10011") - self.assertEqual(mask_a - mask_b, mask_target) + diff = mask_a - mask_b + self.assertEqual(diff, mask_target) + self.assertEqual(mask_a_old, mask_a) # make sure the original mask is unchanged. def test_equal(self): mask_a = BitMask.from_bin("0b101") @@ -139,7 +143,7 @@ def test_subset_a_list(self): list_ = [4, 7, 9, 1] mask = BitMask.from_bin("0b0101") target_masked_list = [7, 1] - masked_list = mask.subset_a_list(list_) + masked_list = mask.masked_select_from_list(list_) self.assertEqual(target_masked_list, masked_list) def test_nonzero_bits(self): From b61ad0fce78da3980861cb6a83d0282412feb5e0 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 13:15:08 +0200 Subject: [PATCH 15/27] renamed download_cli_file back to download_cli by using sys.modules --- lightly/cli/__init__.py | 2 +- .../cli/{download_cli_file.py => download_cli.py} | 3 ++- setup.py | 2 +- tests/cli/test_cli_download.py | 14 ++++++++------ 4 files changed, 12 insertions(+), 9 deletions(-) rename lightly/cli/{download_cli_file.py => download_cli.py} (98%) diff --git a/lightly/cli/__init__.py b/lightly/cli/__init__.py index f8ff2886b..e64a86144 100644 --- a/lightly/cli/__init__.py +++ b/lightly/cli/__init__.py @@ -10,4 +10,4 @@ from lightly.cli.train_cli import train_cli from lightly.cli.embed_cli import embed_cli from lightly.cli.upload_cli import upload_cli -from lightly.cli.download_cli_file import download_cli +from lightly.cli.download_cli import download_cli diff --git a/lightly/cli/download_cli_file.py b/lightly/cli/download_cli.py similarity index 98% rename from lightly/cli/download_cli_file.py rename to lightly/cli/download_cli.py index 57ce8367d..5ee69c3f8 100644 --- a/lightly/cli/download_cli_file.py +++ b/lightly/cli/download_cli.py @@ -10,6 +10,7 @@ import os import shutil +import warnings import hydra from tqdm import tqdm @@ -47,7 +48,7 @@ def _download_cli(cfg, is_cli_call=True): tag_name_id_dict = dict([tag.name, tag.id] for tag in api_workflow_client._get_all_tags()) tag_id = tag_name_id_dict.get(tag_name, None) if tag_id is None: - print(f'The specified tag {tag_name} does not exist.') + warnings.warn(f'The specified tag {tag_name} does not exist.') return # get tag data diff --git a/setup.py b/setup.py index f0d9527f8..b9af51836 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ def 
load_requirements(path_dir=PATH_ROOT, filename='base.txt', comment_char='#') "lightly-embed = lightly.cli.embed_cli:entry", "lightly-magic = lightly.cli.lightly_cli:entry", "lightly-upload = lightly.cli.upload_cli:entry", - "lightly-download = lightly.cli.download_cli_file:entry", + "lightly-download = lightly.cli.download_cli:entry", "lightly-version = lightly.cli.version_cli:entry", ] } diff --git a/tests/cli/test_cli_download.py b/tests/cli/test_cli_download.py index a9e3ccd2d..8ad66cce1 100644 --- a/tests/cli/test_cli_download.py +++ b/tests/cli/test_cli_download.py @@ -1,5 +1,6 @@ import os import re +import sys import tempfile from hydra.experimental import compose, initialize @@ -8,13 +9,13 @@ from tests.api_workflow.mocked_api_workflow_client import MockedApiWorkflowSetup, MockedApiWorkflowClient -#in download_cli_file.py: from lightly.api.api_workflow_client import ApiWorkflowClient +#in download_cli.py: from lightly.api.api_workflow_client import ApiWorkflowClient class TestCLIDownload(MockedApiWorkflowSetup): @classmethod def setUpClass(cls) -> None: - lightly.cli.download_cli_file.ApiWorkflowClient = MockedApiWorkflowClient + sys.modules["lightly.cli.download_cli"].ApiWorkflowClient = MockedApiWorkflowClient initialize(config_path="../../lightly/cli/config", job_name="test_app") def setUp(self): @@ -41,22 +42,23 @@ def test_parse_cli_string(self): def test_download_base(self): cli_string = "lightly-download token='123' dataset_id='XYZ'" self.parse_cli_string(cli_string) - lightly.cli.download_cli_file.download_cli(self.cfg) + lightly.cli.download_cli(self.cfg) def test_download_tag_name(self): cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='sampled_tag_xyz'" self.parse_cli_string(cli_string) - lightly.cli.download_cli_file.download_cli(self.cfg) + lightly.cli.download_cli(self.cfg) def test_download_tag_name_nonexisting(self): cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='nonexisting_xyz'" self.parse_cli_string(cli_string) - lightly.cli.download_cli_file.download_cli(self.cfg) + with self.assertWarns(Warning): + lightly.cli.download_cli(self.cfg) def test_download_tag_name_exclude_parent(self): cli_string = "lightly-download token='123' dataset_id='XYZ' tag_name='sampled_tag_xyz' exclude_parent_tag=True" self.parse_cli_string(cli_string) - lightly.cli.download_cli_file.download_cli(self.cfg) + lightly.cli.download_cli(self.cfg) def tearDown(self) -> None: try: From b3e2b7866a6d52c3978bed9d147dc6909f7c4da5 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 13:46:19 +0200 Subject: [PATCH 16/27] Better warning string --- lightly/active_learning/agents/agent.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lightly/active_learning/agents/agent.py b/lightly/active_learning/agents/agent.py index 550522b07..a601117ca 100644 --- a/lightly/active_learning/agents/agent.py +++ b/lightly/active_learning/agents/agent.py @@ -126,8 +126,7 @@ def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> Tupl "including the current labeled set " "(sampler_config.n_samples) " "is smaller than the number of samples in the current labeled set." 
- "Skipping the sampling and returning the old labeled_set and" - "no ne filenames.") + "Skipping the sampling and returning the previous labeled set.") return self.labeled_set, [] # calculate scores From c9c0aae8f781ccd2de7ba131e52368a291ebb385 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Mon, 19 Apr 2021 15:44:08 +0200 Subject: [PATCH 17/27] renamed scorer.calculate_scores() --- docs/source/getting_started/active_learning.rst | 7 ++++--- lightly/active_learning/agents/agent.py | 2 +- lightly/active_learning/scorers/classification.py | 8 +++++++- lightly/active_learning/scorers/detection.py | 8 +++++++- lightly/active_learning/scorers/scorer.py | 10 +++++++++- tests/active_learning/test_ScorerClassification.py | 2 +- tests/active_learning/test_ScorerObjectDetection.py | 6 +++--- 7 files changed, 32 insertions(+), 11 deletions(-) diff --git a/docs/source/getting_started/active_learning.rst b/docs/source/getting_started/active_learning.rst index 5b829a3ca..58e244522 100644 --- a/docs/source/getting_started/active_learning.rst +++ b/docs/source/getting_started/active_learning.rst @@ -45,9 +45,10 @@ Lightly makes use of the following concepts for active learning: * **Scorer:** :py:class:`lightly.active_learning.scorers.scorer.Scorer` The `Scorer` takes as input the predictions of a pre-trained model on the set - of unlabeled images. It evaluates different scores based on how certain the model - is about the images and passes them to the API so the sampler can use them with - Coral. + of unlabeled images. It offers a `calculate_scores()` method, which evaluates + different scores based on how certain the model is about the images. When + performing a sampling, the scores are passed to the API so the sampler can use + them with Coral. Continue reading to see how these components interact and how active learning is diff --git a/lightly/active_learning/agents/agent.py b/lightly/active_learning/agents/agent.py index a601117ca..ba9ecdddf 100644 --- a/lightly/active_learning/agents/agent.py +++ b/lightly/active_learning/agents/agent.py @@ -137,7 +137,7 @@ def query(self, sampler_config: SamplerConfig, al_scorer: Scorer = None) -> Tupl raise ValueError(f"The scorer must have exactly as many samples as in the unlabeled set," f"but there are {no_samples_with_predictions} predictions in the scorer," f"but {no_unlabeled_samples} in the unlabeled set.") - scores_dict = al_scorer._calculate_scores() + scores_dict = al_scorer.calculate_scores() else: scores_dict = None diff --git a/lightly/active_learning/scorers/classification.py b/lightly/active_learning/scorers/classification.py index d830f6dc9..0a2f5cfd4 100644 --- a/lightly/active_learning/scorers/classification.py +++ b/lightly/active_learning/scorers/classification.py @@ -67,7 +67,13 @@ class ScorerClassification(Scorer): def __init__(self, model_output: np.ndarray): super(ScorerClassification, self).__init__(model_output) - def _calculate_scores(self) -> Dict[str, np.ndarray]: + def calculate_scores(self) -> Dict[str, np.ndarray]: + """Calculates and returns the active learning scores. + + Returns: + A dictionary mapping from the score name (as string) + to the scores (as a single-dimensional numpy array). 
+ """ scores = dict() scores["prediction-margin"] = self._get_prediction_margin_score() scores["prediction-entropy"] = self._get_prediction_entropy_score() diff --git a/lightly/active_learning/scorers/detection.py b/lightly/active_learning/scorers/detection.py index 2b91ee6e3..86fff5957 100644 --- a/lightly/active_learning/scorers/detection.py +++ b/lightly/active_learning/scorers/detection.py @@ -177,7 +177,13 @@ def _check_config(self): else: self.config = default_conf - def _calculate_scores(self) -> Dict[str, np.ndarray]: + def calculate_scores(self) -> Dict[str, np.ndarray]: + """Calculates and returns the active learning scores. + + Returns: + A dictionary mapping from the score name (as string) + to the scores (as a single-dimensional numpy array). + """ scores = dict() scores['object-frequency'] = self._get_object_frequency() scores['prediction-margin'] = self._get_prediction_margin() diff --git a/lightly/active_learning/scorers/scorer.py b/lightly/active_learning/scorers/scorer.py index 6e8a6079c..31bb868e1 100644 --- a/lightly/active_learning/scorers/scorer.py +++ b/lightly/active_learning/scorers/scorer.py @@ -8,5 +8,13 @@ class Scorer(): def __init__(self, model_output): self.model_output = model_output - def _calculate_scores(self) -> Dict[str, np.ndarray]: + def calculate_scores(self) -> Dict[str, np.ndarray]: + """Calculates and returns the active learning scores + + Which scores are calculated depends on the implementation + of this parent class by the child classes. + Returns: + A dictionary mapping from the score name (as string) + to the scores (as a single-dimensional numpy array). + """ raise NotImplementedError diff --git a/tests/active_learning/test_ScorerClassification.py b/tests/active_learning/test_ScorerClassification.py index e5c1d523d..4924faa63 100644 --- a/tests/active_learning/test_ScorerClassification.py +++ b/tests/active_learning/test_ScorerClassification.py @@ -13,7 +13,7 @@ def test_score_calculation(self): predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis] model_output = predictions_normalized scorer = ScorerClassification(model_output) - scores = scorer._calculate_scores() + scores = scorer.calculate_scores() scores_prediction_entropy = scores["prediction-entropy"] scores_prediction_margin = scores["prediction-margin"] diff --git a/tests/active_learning/test_ScorerObjectDetection.py b/tests/active_learning/test_ScorerObjectDetection.py index 473f4384f..9e024c565 100644 --- a/tests/active_learning/test_ScorerObjectDetection.py +++ b/tests/active_learning/test_ScorerObjectDetection.py @@ -69,7 +69,7 @@ def test_object_detection_scorer(self): ) scorer = ScorerObjectDetection(self.dummy_data) - scores = scorer._calculate_scores() + scores = scorer.calculate_scores() res = scores['object-frequency'] self.assertEqual(len(res), len(self.dummy_data)) @@ -101,7 +101,7 @@ def test_object_detection_scorer_config(self): # check for default config scorer = ScorerObjectDetection(self.dummy_data) - scores = scorer._calculate_scores() + scores = scorer.calculate_scores() expected_default_config = { 'frequency_penalty': 0.25, 'min_score': 0.9 @@ -114,7 +114,7 @@ def test_object_detection_scorer_config(self): 'min_score': 0.6 } scorer = ScorerObjectDetection(self.dummy_data, config=new_config) - scores = scorer._calculate_scores() + scores = scorer.calculate_scores() self.assertDictEqual(scorer.config, new_config) # check for invalid key passed From fd579f90a12d7087c882206279295a2d1148f897 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: 
Mon, 19 Apr 2021 16:28:21 +0200 Subject: [PATCH 18/27] scorer.py: shorter docstring --- lightly/active_learning/scorers/scorer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lightly/active_learning/scorers/scorer.py b/lightly/active_learning/scorers/scorer.py index 31bb868e1..d62b9c217 100644 --- a/lightly/active_learning/scorers/scorer.py +++ b/lightly/active_learning/scorers/scorer.py @@ -9,12 +9,6 @@ def __init__(self, model_output): self.model_output = model_output def calculate_scores(self) -> Dict[str, np.ndarray]: - """Calculates and returns the active learning scores - - Which scores are calculated depends on the implementation - of this parent class by the child classes. - Returns: - A dictionary mapping from the score name (as string) - to the scores (as a single-dimensional numpy array). + """Calculates and returns active learning scores in a dictionary. """ raise NotImplementedError From 7ba5a0fbf37581feecf7271d81c50a20ac433f05 Mon Sep 17 00:00:00 2001 From: Philipp Wirth <65946090+philippmwirth@users.noreply.github.com> Date: Tue, 20 Apr 2021 07:24:51 +0200 Subject: [PATCH 19/27] 327 Add check for comma in filenames of CSV file (#302) * Add check for comma in filenames of CSV file * Add tests for better coverage --- lightly/api/api_workflow_upload_embeddings.py | 22 ++++++++++++++++--- .../test_api_workflow_upload_embeddings.py | 20 ++++++++++++++++- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/lightly/api/api_workflow_upload_embeddings.py b/lightly/api/api_workflow_upload_embeddings.py index a82fc0c9d..6595e9fbb 100644 --- a/lightly/api/api_workflow_upload_embeddings.py +++ b/lightly/api/api_workflow_upload_embeddings.py @@ -5,6 +5,18 @@ from lightly.openapi_generated.swagger_client.models.write_csv_url_data import WriteCSVUrlData + +def _is_valid_filename(filename: str): + """Returns False if the filename is misformatted. 
+ + """ + invalid_characters = [','] + for character in invalid_characters: + if character in filename: + return False + return True + + class _UploadEmbeddingsMixin: def set_embedding_id_by_name(self, embedding_name: str = None): @@ -39,6 +51,7 @@ def upload_embeddings(self, path_to_embeddings_csv: str, name: str): embeddings_on_server: List[DatasetEmbeddingData] = \ self.embeddings_api.get_embeddings_by_dataset_id(dataset_id=self.dataset_id) names_embeddings_on_server = [embedding.name for embedding in embeddings_on_server] + if name in names_embeddings_on_server: print(f"Aborting upload, embedding with name='{name}' already exists.") self.embedding_id = next(embedding for embedding in embeddings_on_server if embedding.name == name).id @@ -79,10 +92,13 @@ def _order_csv_by_filenames(self, path_to_embeddings_csv: str) -> str: filenames = [row[index_filenames] for row in rows_without_header] if len(filenames) != len(self.filenames_on_server): - raise ValueError(f"There are {len(filenames)} rows in the embedding file, but " - f"{len(self.filenames_on_server)} filenames/samples on the server.") + raise ValueError(f'There are {len(filenames)} rows in the embedding file, but ' + f'{len(self.filenames_on_server)} filenames/samples on the server.') if set(filenames) != set(self.filenames_on_server): - raise ValueError(f"The filenames in the embedding file and the filenames on the server do not align") + raise ValueError(f'The filenames in the embedding file and the filenames on the server do not align') + invalid_filenames = [f for f in filenames if not _is_valid_filename(f)] + if len(invalid_filenames) > 0: + raise ValueError(f'Invalid filename(s) in embedding file: {invalid_filenames}') rows_without_header_ordered = self._order_list_by_filenames(filenames, rows_without_header) diff --git a/tests/api_workflow/test_api_workflow_upload_embeddings.py b/tests/api_workflow/test_api_workflow_upload_embeddings.py index 943613632..9fedc15b2 100644 --- a/tests/api_workflow/test_api_workflow_upload_embeddings.py +++ b/tests/api_workflow/test_api_workflow_upload_embeddings.py @@ -9,7 +9,10 @@ class TestApiWorkflowUploadEmbeddigns(MockedApiWorkflowSetup): - def t_ester_upload_embedding(self, n_data, special_name_first_sample: bool = False): + def t_ester_upload_embedding(self, + n_data, + special_name_first_sample: bool = False, + comma_in_first_sample: bool = False): # create fake embeddings folder_path = tempfile.mkdtemp() path_to_embeddings = os.path.join( @@ -19,6 +22,8 @@ def t_ester_upload_embedding(self, n_data, special_name_first_sample: bool = Fal sample_names = [f'img_{i}.jpg' for i in range(n_data)] if special_name_first_sample: sample_names[0] = "bliblablub" + if comma_in_first_sample: + sample_names[0] = "bli,blablu" labels = [0] * len(sample_names) save_embeddings( path_to_embeddings, @@ -44,6 +49,11 @@ def test_upload_wrong_filenames(self): with self.assertRaises(ValueError): self.t_ester_upload_embedding(n_data=n_data, special_name_first_sample=True) + def test_upload_comma_filenames(self): + n_data = len(self.api_workflow_client.mappings_api.sample_names) + with self.assertRaises(ValueError): + self.t_ester_upload_embedding(n_data=n_data, comma_in_first_sample=True) + def test_set_embedding_id_success(self): embedding_name = self.api_workflow_client.embeddings_api.embeddings[0].name self.api_workflow_client.set_embedding_id_by_name(embedding_name) @@ -55,3 +65,11 @@ def test_set_embedding_id_failure(self): def test_set_embedding_id_default(self): 
         self.api_workflow_client.set_embedding_id_by_name()
+
+    def test_is_valid_filename(self):
+        filenames = [',a', ',', 'a,', 'a']
+        is_valid = [False, False, False, True]
+        result = [
+            lightly.api.api_workflow_upload_embeddings._is_valid_filename(f) for f in filenames
+        ]
+        self.assertListEqual(is_valid, result)

From 6ee6647f42f80ac1bb96bdab62fb1b205815be57 Mon Sep 17 00:00:00 2001
From: MalteEbner
Date: Tue, 20 Apr 2021 08:18:15 +0200
Subject: [PATCH 20/27] rename lighty_subset to lightly_subset

---
 lightly/data/{lighty_subset.py => lightly_subset.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename lightly/data/{lighty_subset.py => lightly_subset.py} (100%)

diff --git a/lightly/data/lighty_subset.py b/lightly/data/lightly_subset.py
similarity index 100%
rename from lightly/data/lighty_subset.py
rename to lightly/data/lightly_subset.py

From 65122d1f6d7d6857e1c54dacdeb2b7311cacbcf7 Mon Sep 17 00:00:00 2001
From: MalteEbner
Date: Tue, 20 Apr 2021 08:18:29 +0200
Subject: [PATCH 21/27] Update test_LightlySubset.py

---
 tests/data/test_LightlySubset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/data/test_LightlySubset.py b/tests/data/test_LightlySubset.py
index 53a3ec9e3..e6dd40050 100644
--- a/tests/data/test_LightlySubset.py
+++ b/tests/data/test_LightlySubset.py
@@ -6,7 +6,7 @@
 import torchvision
 
 from lightly.data.dataset import LightlyDataset
-from lightly.data.lighty_subset import LightlySubset
+from lightly.data.lightly_subset import LightlySubset
 
 from tests.data.test_LightlyDataset import TestLightlyDataset
 

From bd982c9513effd75c11d9b2671de09a4be6ce155 Mon Sep 17 00:00:00 2001
From: Philipp Wirth <65946090+philippmwirth@users.noreply.github.com>
Date: Tue, 20 Apr 2021 08:40:19 +0200
Subject: [PATCH 22/27] 304 fix import api workflow client pw (#305)

* Fix imports for api workflow client and al utils

* Add active learning utils to docs

* Add more tests for imports
---
 docs/source/lightly.active_learning.rst   |  6 ++++++
 lightly/active_learning/utils/__init__.py |  7 +++++++
 lightly/api/__init__.py                   |  1 +
 tests/imports/test_nested_imports.py      | 17 +++++++++++++----
 tests/imports/test_seminested_imports.py  |  8 ++++----
 5 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/docs/source/lightly.active_learning.rst b/docs/source/lightly.active_learning.rst
index 2c9df027c..27fe5ed79 100644
--- a/docs/source/lightly.active_learning.rst
+++ b/docs/source/lightly.active_learning.rst
@@ -22,3 +22,9 @@ lightly.active_learning
 .. automodule:: lightly.active_learning.scorers.detection
    :members:
 
+.utils
+--------
+.. automodule:: lightly.active_learning.utils.bounding_box
+   :members:
+.. automodule:: lightly.active_learning.utils.object_detection_output
+   :members:
\ No newline at end of file
diff --git a/lightly/active_learning/utils/__init__.py b/lightly/active_learning/utils/__init__.py
index e69de29bb..81ba8feb5 100644
--- a/lightly/active_learning/utils/__init__.py
+++ b/lightly/active_learning/utils/__init__.py
@@ -0,0 +1,7 @@
+""" Collection of Utils for Active Learning """
+
+# Copyright (c) 2020. Lightly AG and its affiliates.
+# All Rights Reserved
+
+from lightly.active_learning.utils.bounding_box import BoundingBox
+from lightly.active_learning.utils.object_detection_output import ObjectDetectionOutput
\ No newline at end of file
diff --git a/lightly/api/__init__.py b/lightly/api/__init__.py
index 86570f7b8..2aa24e779 100644
--- a/lightly/api/__init__.py
+++ b/lightly/api/__init__.py
@@ -3,4 +3,5 @@
 # Copyright (c) 2020.
Lightly AG and its affiliates. # All Rights Reserved +from lightly.api.api_workflow_client import ApiWorkflowClient from lightly.api import routes diff --git a/tests/imports/test_nested_imports.py b/tests/imports/test_nested_imports.py index b21207ce1..e1f854a2b 100644 --- a/tests/imports/test_nested_imports.py +++ b/tests/imports/test_nested_imports.py @@ -7,17 +7,26 @@ class TestNestedImports(unittest.TestCase): def test_nested_imports(self): - # active learning (commented out don't work) - #lightly.active_learning.agents.agent.ActiveLearningAgent - #lightly.active_learning.agents.ActiveLearningAgent + # active learning + lightly.active_learning.agents.agent.ActiveLearningAgent + lightly.active_learning.agents.ActiveLearningAgent lightly.active_learning.config.sampler_config.SamplerConfig - #lightly.active_learning.scorers.classification.ScorerClassification + lightly.active_learning.config.SamplerConfig + lightly.active_learning.scorers.classification.ScorerClassification + lightly.active_learning.scorers.ScorerClassification + lightly.active_learning.scorers.detection.ScorerObjectDetection + lightly.active_learning.scorers.ScorerObjectDetection + lightly.active_learning.utils.bounding_box.BoundingBox + lightly.active_learning.utils.BoundingBox + lightly.active_learning.utils.object_detection_output.ObjectDetectionOutput + lightly.active_learning.utils.ObjectDetectionOutput # api imports lightly.api.routes.users.docker.get_authorization lightly.api.routes.users.docker.get_soft_authorization lightly.api.routes.users.docker.post_diagnostics lightly.api.api_workflow_client.ApiWorkflowClient + lightly.api.ApiWorkflowClient lightly.api.bitmask.BitMask # data imports diff --git a/tests/imports/test_seminested_imports.py b/tests/imports/test_seminested_imports.py index 944bd1446..60f602bc4 100644 --- a/tests/imports/test_seminested_imports.py +++ b/tests/imports/test_seminested_imports.py @@ -9,10 +9,10 @@ class TestSemiNestedImports(unittest.TestCase): def test_seminested_imports(self): from lightly import active_learning # active learning (commented out don't work) - #lightly.active_learning.agents.agent.ActiveLearningAgent - #lightly.active_learning.agents.ActiveLearningAgent - active_learning.config.sampler_config.SamplerConfig - #lightly.active_learning.scorers.classification.ScorerClassification + active_learning.agents.ActiveLearningAgent + active_learning.config.SamplerConfig + active_learning.scorers.ScorerClassification + active_learning.scorers.ScorerObjectDetection # api imports from lightly import api From 0170e85026afce30334fd8401177e353fac42a8e Mon Sep 17 00:00:00 2001 From: Philipp Wirth <65946090+philippmwirth@users.noreply.github.com> Date: Wed, 21 Apr 2021 11:18:03 +0200 Subject: [PATCH 23/27] 132 Docs on embeddings upload (#303) * Fix docstrings and add api workflow clients to autodocs * Add short section on how to upload embeddings from the Python package * Add short section on how to upload images from Pytyhon * Make comments more detailed --- docs/source/getting_started/platform.rst | 33 +++++++++++++++++-- docs/source/lightly.api.rst | 14 ++++++++ lightly/api/api_workflow_client.py | 10 +++--- lightly/api/api_workflow_upload_embeddings.py | 19 ++++++++--- 4 files changed, 64 insertions(+), 12 deletions(-) diff --git a/docs/source/getting_started/platform.rst b/docs/source/getting_started/platform.rst index 5e4f0d648..5b62e5ca3 100644 --- a/docs/source/getting_started/platform.rst +++ b/docs/source/getting_started/platform.rst @@ -159,12 +159,39 @@ drag-and-drop or using 
the Python Package according to:
 
 You can upload up to 1'000 images using the frontend.
 
+Images can also be uploaded from a Python script:
+
+.. code-block:: python
+
+    from lightly.api.api_workflow_client import ApiWorkflowClient
+    client = ApiWorkflowClient(token='123', dataset_id='xyz')
+
+    # change mode to 'thumbnails' or 'meta' if you're working with sensitive data
+    client.upload_dataset('path/to/your/images/', mode='full')
+
+
 Upload Embeddings
 -------------------------
 
-Embeddings can be uploaded using the Python Package.
-You can not upload embedding through the web interface. Instead
-:ref:`ref-upload-embedding-lightly`
+Embeddings can be uploaded using the Python Package or the front-end. The simplest
+way to upload the embeddings is from the command line: :ref:`ref-upload-embedding-lightly`.
+
+If you have a numpy array of image embeddings, the filenames of the images, and categorical pseudo-labels,
+you can use the `save_embeddings` function to store them in a lightly-compatible CSV format and upload
+them from your Python code or using the CLI. The following snippet shows how to upload the embeddings from Python.
+
+.. code-block:: python
+
+    from lightly.utils import save_embeddings
+    from lightly.api.api_workflow_client import ApiWorkflowClient
+
+    # store the embeddings in a lightly compatible CSV format before uploading
+    # them to the platform
+    save_embeddings('embeddings.csv', embeddings, labels, filenames)
+
+    # upload the embeddings.csv file to the platform
+    client = ApiWorkflowClient(token='123', dataset_id='xyz')
+    client.upload_embeddings('embeddings.csv', name='my-embeddings')
 
 
 Sampling
diff --git a/docs/source/lightly.api.rst b/docs/source/lightly.api.rst
index 601d36964..18d01e78b 100644
--- a/docs/source/lightly.api.rst
+++ b/docs/source/lightly.api.rst
@@ -8,6 +8,20 @@ lightly.api
 
 .. automodule:: lightly.api.api_workflow_client
    :members:
+.. automodule:: lightly.api.api_workflow_datasets
+   :members:
+
+.. automodule:: lightly.api.api_workflow_download_dataset
+   :members:
+
+.. automodule:: lightly.api.api_workflow_sampling
+   :members:
+
+.. automodule:: lightly.api.api_workflow_upload_dataset
+   :members:
+
+.. automodule:: lightly.api.api_workflow_upload_embeddings
+   :members:
 
 .utils
 ---------------
diff --git a/lightly/api/api_workflow_client.py b/lightly/api/api_workflow_client.py
index 26f38533a..286008300 100644
--- a/lightly/api/api_workflow_client.py
+++ b/lightly/api/api_workflow_client.py
@@ -85,12 +85,11 @@ def check_version_compatibility(self):
 
     @property
     def dataset_id(self) -> str:
-        ''' Returns the dataset_id
+        '''The current dataset_id.
 
         If the dataset_id is set, it is returned.
-        If it is unset, then the dataset_id of the last modified dataset is taken.
-
-        '''
+        If it is not set, then the dataset_id of the last modified dataset is selected.
+        '''
         try:
             return self._dataset_id
         except AttributeError:
@@ -127,6 +126,9 @@ def _order_list_by_filenames(self, filenames_for_list: List[str], list_to_order:
 
     @property
     def filenames_on_server(self):
+        '''The list of the filenames in the dataset.
+
+        '''
         if not hasattr(self, "_filenames_on_server"):
             self._filenames_on_server = self.mappings_api.
\ get_sample_mappings_by_dataset_id(dataset_id=self.dataset_id, field="fileName") diff --git a/lightly/api/api_workflow_upload_embeddings.py b/lightly/api/api_workflow_upload_embeddings.py index 6595e9fbb..7bc6b918c 100644 --- a/lightly/api/api_workflow_upload_embeddings.py +++ b/lightly/api/api_workflow_upload_embeddings.py @@ -20,6 +20,15 @@ def _is_valid_filename(filename: str): class _UploadEmbeddingsMixin: def set_embedding_id_by_name(self, embedding_name: str = None): + """Sets the embedding id of the client by embedding name. + + Args: + embedding_name: + Name under which the embedding was uploaded. + + Raises: + ValueError if the embedding does not exist. + """ embeddings: List[DatasetEmbeddingData] = \ self.embeddings_api.get_embeddings_by_dataset_id(dataset_id=self.dataset_id) @@ -38,14 +47,14 @@ def upload_embeddings(self, path_to_embeddings_csv: str, name: str): First checks that the specified embedding name is not on ther server. If it is, the upload is aborted. Then creates a new csv with the embeddings in the order specified on the server. Next it uploads it to the server. The received embedding_id is saved as a property of self. + Args: - path_to_embeddings_csv: the filepath to the .csv containing the embeddings, e.g. "path/to/embeddings.csv" - name: The name of the embedding. If an embedding with such a name already exists on the server, + path_to_embeddings_csv: + The path to the .csv containing the embeddings, e.g. "path/to/embeddings.csv" + name: + The name of the embedding. If an embedding with such a name already exists on the server, the upload is aborted. - Returns: - None - """ # get the names of the current embeddings on the server: embeddings_on_server: List[DatasetEmbeddingData] = \ From 3a9b4d8a477c248c8c9d93c7464d747da707570c Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Wed, 21 Apr 2021 18:51:40 +0200 Subject: [PATCH 24/27] upload-cli: allow creation of dataset (#308) - changed one line in GitHub action to solve #313 - added parameter `new_dataset_name` to the `config.yaml` - allow to use `lightly-upload` and `lightly-magic` either with creating a new dataset in the webapp (if `new_dataset_name` is defined) or uploads to an existing dataset (if `dataset_id` is defined) - wrote test for the `upload_cli` and `lightly_cli` increasing coverage quite a lot - added docstrings to the `upload_cli` including the new parameter --- .github/workflows/test.yml | 2 +- .../getting_started/command_line_tool.rst | 10 ++- .../source/tutorials/structure_your_input.rst | 2 +- lightly/cli/config/config.yaml | 11 +-- lightly/cli/lightly_cli.py | 2 +- lightly/cli/upload_cli.py | 55 ++++++++----- tests/cli/test_cli_download.py | 5 +- tests/cli/test_cli_magic.py | 76 +++++++++++++++++ tests/cli/test_cli_upload.py | 81 +++++++++++++++++++ 9 files changed, 213 insertions(+), 31 deletions(-) create mode 100644 tests/cli/test_cli_magic.py create mode 100644 tests/cli/test_cli_upload.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0936d87e8..b9ad492b7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -27,7 +27,7 @@ jobs: run: pip install -e '.[all]' - name: Run Pytest run: | - LIGHTLY_SERVER_LOCATION="localhost:-1" + export LIGHTLY_SERVER_LOCATION="localhost:-1" pip install pytest-cov python -m pytest -s -v --runslow --cov=./lightly --cov-report=xml --ignore=./lightly/openapi_generated/ - name: Upload coverage to Codecov diff --git a/docs/source/getting_started/command_line_tool.rst 
b/docs/source/getting_started/command_line_tool.rst index 046ffb615..91f560864 100644 --- a/docs/source/getting_started/command_line_tool.rst +++ b/docs/source/getting_started/command_line_tool.rst @@ -99,7 +99,9 @@ Upload data using the CLI In this example we will upload a dataset to the Lightly Platform. First, make sure you have an account on `Lightly `_. A free account is sufficient. Log in to the app and create a new dataset. -You will get a *token* and *dataset_id* which can be used to upload your dataset +You will get a *token* and *dataset_id* which can be used to upload your dataset. +Alternatively, you can create a new dataset directly with the *token* +by providing the *new_dataset_name* instead of the *dataset_id*. .. code-block:: bash @@ -110,6 +112,9 @@ You will get a *token* and *dataset_id* which can be used to upload your dataset lightly-upload input_dir=cat embeddings=your_embedding.csv \ token=your_token dataset_id=your_dataset_id + # create a new dataset and upload to it + lightly-upload input_dir=cat token=your_token new_dataset_name=your_dataset_name + .. note:: To obtain your *token* and *dataset_id* check: :ref:`ref-authentication-token` and :ref:`ref-webapp-dataset-id`. @@ -120,6 +125,7 @@ Upload embeddings using the CLI ---------------------------------- You can upload embeddings directly to the Lightly Platform using the CLI. +Again, you can use the *dataset_id* and *new_dataset_name* interchangeably. .. code-block:: bash @@ -129,7 +135,7 @@ You can upload embeddings directly to the Lightly Platform using the CLI. # you can upload the dataset together with the embeddings lightly-upload input_dir=cat embeddings=your_embedding.csv \ - token=your_token dataset_id=your_dataset_id + token=your_token new_dataset_name=your_dataset_name Download data using the CLI diff --git a/docs/source/tutorials/structure_your_input.rst b/docs/source/tutorials/structure_your_input.rst index 271a611bf..1bc1c08d7 100644 --- a/docs/source/tutorials/structure_your_input.rst +++ b/docs/source/tutorials/structure_your_input.rst @@ -156,7 +156,7 @@ To upload the three videos from above to the platform, you can use .. code-block:: bash - lightly-upload token='123' dataset_id='XYZ' input_dir='data/' + lightly-upload token='123' new_dataset_name='my_video_dataset' input_dir='data/' All other operations (like training a self-supervised model and embedding the frames individually) also work on video data. Give it a try! diff --git a/lightly/cli/config/config.yaml b/lightly/cli/config/config.yaml index be8a525a2..1a36b16f0 100644 --- a/lightly/cli/config/config.yaml +++ b/lightly/cli/config/config.yaml @@ -7,18 +7,19 @@ embeddings: '' # Path to csv file which holds embeddings. checkpoint: '' # Path to a model checkpoint. If left empty, a pre-trained model # will be used. -### platform +### Lightly platform # The following arguments are required for requests to the # Lightly platform. -token: '' # User access token to the platform. -dataset_id: '' # Identifier of the dataset on the platform +token: '' # User access token to the Lightly platform. +dataset_id: '' # Identifier of the dataset on the Lightly platform. +new_dataset_name: '' # Name of the new dataset to be created on the Lightly platform upload: 'full' # Whether to upload full images, thumbnails only, or metadata only. # Must be one of ['full', 'thumbnails', 'none'] resize: -1 # Allow resizing of the images before uploading, usage =-1, =x, =[x,y] -embedding_name: 'default' # Name of the embedding to be used on the platform. 
+embedding_name: 'default' # Name of the embedding to be used on the Lightly platform.
 emb_upload_bsz: 32 # Number of embeddings which are uploaded in a single batch.
 tag_name: 'initial-tag' # Name of the requested tag on the Lightly platform.
-exclude_parent_tag: False # If true, only the samples in the defined tag, but without the parent tag, are taken
+exclude_parent_tag: False # If true, only the samples in the defined tag, but without the parent tag, are taken.
 
 ### training and embeddings
 pre_trained: True # Whether to use a pre-trained model or not
diff --git a/lightly/cli/lightly_cli.py b/lightly/cli/lightly_cli.py
index b5fc78c71..184d3d638 100644
--- a/lightly/cli/lightly_cli.py
+++ b/lightly/cli/lightly_cli.py
@@ -31,7 +31,7 @@ def _lightly_cli(cfg, is_cli_call=True):
     embeddings = _embed_cli(cfg, is_cli_call)
     cfg['embeddings'] = embeddings
 
-    if cfg['token'] and cfg['dataset_id']:
+    if cfg['token'] and (cfg['dataset_id'] or cfg['new_dataset_name']):
         _upload_cli(cfg)
 
 
diff --git a/lightly/cli/upload_cli.py b/lightly/cli/upload_cli.py
index d4ac0abb9..28fa49ccd 100644
--- a/lightly/cli/upload_cli.py
+++ b/lightly/cli/upload_cli.py
@@ -7,6 +7,7 @@
 # Copyright (c) 2020. Lightly AG and its affiliates.
 # All Rights Reserved
 
+import warnings
 
 import hydra
 
@@ -20,7 +21,6 @@
 
 
 def _upload_cli(cfg, is_cli_call=True):
-
     input_dir = cfg['input_dir']
     if input_dir and is_cli_call:
         input_dir = fix_input_path(input_dir)
@@ -31,6 +31,23 @@
 
     dataset_id = cfg['dataset_id']
     token = cfg['token']
+    new_dataset_name = cfg['new_dataset_name']
+
+    if not token:
+        warnings.warn('Please specify your access token. For help, try: lightly-upload --help')
+        return
+
+    dataset_id_ok = dataset_id and len(dataset_id) > 0
+    new_dataset_name_ok = new_dataset_name and len(new_dataset_name) > 0
+    if new_dataset_name_ok and not dataset_id_ok:
+        api_workflow_client = ApiWorkflowClient(token=token)
+        api_workflow_client.create_dataset(dataset_name=new_dataset_name)
+    elif dataset_id_ok and not new_dataset_name_ok:
+        api_workflow_client = ApiWorkflowClient(token=token, dataset_id=dataset_id)
+    else:
+        warnings.warn('Please specify either the dataset_id of an existing dataset or a new_dataset_name. '
+                      'For help, try: lightly-upload --help')
+        return
 
     size = cfg['resize']
     if not isinstance(size, int):
@@ -39,15 +56,6 @@
     if isinstance(size, tuple) or size > 0:
         transform = torchvision.transforms.Resize(size)
 
-    if not token or not dataset_id:
-        print('Please specify your access token and dataset id.')
-        print('For help, try: lightly-upload --help')
-        return
-
-    api_workflow_client = ApiWorkflowClient(
-        token=token, dataset_id=dataset_id
-    )
-
     if input_dir:
         mode = cfg['upload']
         dataset = LightlyDataset(input_dir=input_dir, transform=transform)
@@ -79,13 +87,16 @@ def upload_cli(cfg):
             Path to the csv file storing the embeddings generated by
             lightly.
         token:
-            User access token to the Lightly platform. If dataset_id
-            and token are specified, the images and embeddings are
-            uploaded to the platform.
+            User access token to the Lightly platform. It needs to be
+            specified to upload the images and embeddings to the platform.
         dataset_id:
-            Identifier of the dataset on the Lightly platform. If
-            dataset_id and token are specified, the images and
-            embeddings are uploaded to the platform.
+            Identifier of the dataset on the Lightly platform.
+            Either the dataset_id or the new_dataset_name need to be
+            specified.
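For reference, a sketch of the Python workflow that mirrors the new new_dataset_name behaviour, using only client calls that appear in this patch series (create_dataset, upload_dataset, upload_embeddings); the token, paths and names below are placeholders.

    from lightly.api.api_workflow_client import ApiWorkflowClient

    # no dataset_id given: register a new, empty dataset by name instead,
    # which is what _upload_cli now does when new_dataset_name is set
    client = ApiWorkflowClient(token='123')
    client.create_dataset(dataset_name='new_dataset_name_xyz')

    # afterwards, images and embeddings can be uploaded to the new dataset
    client.upload_dataset('path/to/your/images/', mode='full')
    client.upload_embeddings('path/to/embeddings.csv', name='default')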
+ new_dataset_name: + The name of the new dataset to create on the Lightly platform. + Either the dataset_id or the new_dataset_name need to be + specified. upload: String to determine whether to upload the full images, thumbnails only, or metadata only. @@ -102,11 +113,14 @@ def upload_cli(cfg): to (size * height / width, size). Examples: - >>> # upload thumbnails to the Lightly platform + >>> # create a new dataset on the Lightly platform and upload thumbnails to it + >>> lightly-upload input_dir=data/ token='123' new_dataset_name='new_dataset_name_xyz' + >>> + >>> # upload thumbnails to the Lightly platform to an existing dataset >>> lightly-upload input_dir=data/ token='123' dataset_id='XYZ' >>> - >>> # upload full images to the Lightly platform - >>> lightly-upload input_dir=data/ token='123' dataset_id='XYZ' upload='full' + >>> # create a new dataset on the Lightly platform and upload full images to it + >>> lightly-upload input_dir=data/ token='123' new_dataset_name='new_dataset_name_xyz' upload='full' >>> >>> # upload metadata to the Lightly platform >>> lightly-upload input_dir=data/ token='123' dataset_id='XYZ' upload='metadata' @@ -117,6 +131,9 @@ def upload_cli(cfg): >>> # upload both, images and embeddings in a single command >>> lightly-upload input_dir=data/ embeddings=embeddings.csv upload='full' \\ >>> token='123' dataset_id='XYZ' + >>> # create a new dataset on the Lightly platform and upload both, images and embeddings + >>> lightly-upload input_dir=data/ embeddings=embeddings.csv upload='full' \\ + >>> token='123' new_dataset_name='new_dataset_name_xyz' """ _upload_cli(cfg) diff --git a/tests/cli/test_cli_download.py b/tests/cli/test_cli_download.py index 8ad66cce1..80ac41dda 100644 --- a/tests/cli/test_cli_download.py +++ b/tests/cli/test_cli_download.py @@ -16,10 +16,11 @@ class TestCLIDownload(MockedApiWorkflowSetup): @classmethod def setUpClass(cls) -> None: sys.modules["lightly.cli.download_cli"].ApiWorkflowClient = MockedApiWorkflowClient - initialize(config_path="../../lightly/cli/config", job_name="test_app") + def setUp(self): - self.cfg = compose(config_name="config", overrides=["token='123'", "dataset_id='XYZ'"]) + with initialize(config_path="../../lightly/cli/config", job_name="test_app"): + self.cfg = compose(config_name="config", overrides=["token='123'", "dataset_id='XYZ'"]) def parse_cli_string(self, cli_words: str): diff --git a/tests/cli/test_cli_magic.py b/tests/cli/test_cli_magic.py new file mode 100644 index 000000000..e7d28df74 --- /dev/null +++ b/tests/cli/test_cli_magic.py @@ -0,0 +1,76 @@ +import os +import re +import sys +import tempfile + +import torchvision +from hydra.experimental import compose, initialize + +import lightly +from tests.api_workflow.mocked_api_workflow_client import MockedApiWorkflowSetup, MockedApiWorkflowClient + + +class TestCLIMagic(MockedApiWorkflowSetup): + + @classmethod + def setUpClass(cls) -> None: + sys.modules["lightly.cli.upload_cli"].ApiWorkflowClient = MockedApiWorkflowClient + + def setUp(self): + MockedApiWorkflowSetup.setUp(self) + self.create_fake_dataset() + with initialize(config_path="../../lightly/cli/config", job_name="test_app"): + self.cfg = compose(config_name="config", overrides=[ + "token='123'", + f"input_dir={self.folder_path}", + "trainer.max_epochs=0" + ]) + + def create_fake_dataset(self): + n_data = len(self.api_workflow_client.filenames_on_server) + self.dataset = torchvision.datasets.FakeData(size=n_data, image_size=(3, 32, 32)) + + self.folder_path = tempfile.mkdtemp() + sample_names = 
[f'img_{i}.jpg' for i in range(n_data)] + self.sample_names = sample_names + for sample_idx in range(n_data): + data = self.dataset[sample_idx] + path = os.path.join(self.folder_path, sample_names[sample_idx]) + data[0].save(path) + + def parse_cli_string(self, cli_words: str): + cli_words = cli_words.replace("lightly-magic ", "") + cli_words = re.split("=| ", cli_words) + assert len(cli_words) % 2 == 0 + dict_keys = cli_words[0::2] + dict_values = cli_words[1::2] + for key, value in zip(dict_keys, dict_values): + value = value.strip('\"') + value = value.strip('\'') + self.cfg[key] = value + + def test_parse_cli_string(self): + cli_string = "lightly-magic dataset_id='XYZ' upload='thumbnails'" + self.parse_cli_string(cli_string) + assert self.cfg["dataset_id"] == 'XYZ' + assert self.cfg["upload"] == 'thumbnails' + + def test_magic_new_dataset_name(self): + cli_string = "lightly-magic new_dataset_name='new_dataset_name_xyz'" + self.parse_cli_string(cli_string) + lightly.cli.lightly_cli(self.cfg) + + def test_magic_new_dataset_id(self): + cli_string = "lightly-magic dataset_id='xyz'" + self.parse_cli_string(cli_string) + lightly.cli.lightly_cli(self.cfg) + + def tearDown(self) -> None: + for filename in ["embeddings.csv", "embeddings_sorted.csv"]: + try: + os.remove(filename) + except FileNotFoundError: + pass + + + diff --git a/tests/cli/test_cli_upload.py b/tests/cli/test_cli_upload.py new file mode 100644 index 000000000..c71efad8a --- /dev/null +++ b/tests/cli/test_cli_upload.py @@ -0,0 +1,81 @@ +import os +import re +import sys +import tempfile + +import torchvision +from hydra.experimental import compose, initialize + +import lightly +from tests.api_workflow.mocked_api_workflow_client import MockedApiWorkflowSetup, MockedApiWorkflowClient + + +class TestCLIUpload(MockedApiWorkflowSetup): + + @classmethod + def setUpClass(cls) -> None: + sys.modules["lightly.cli.upload_cli"].ApiWorkflowClient = MockedApiWorkflowClient + + def setUp(self): + self.create_fake_dataset() + with initialize(config_path="../../lightly/cli/config", job_name="test_app"): + self.cfg = compose(config_name="config", overrides=["token='123'", f"input_dir={self.folder_path}"]) + + def create_fake_dataset(self, n_data: int=5): + self.dataset = torchvision.datasets.FakeData(size=n_data, + image_size=(3, 32, 32)) + + self.folder_path = tempfile.mkdtemp() + sample_names = [f'img_{i}.jpg' for i in range(n_data)] + self.sample_names = sample_names + for sample_idx in range(n_data): + data = self.dataset[sample_idx] + path = os.path.join(self.folder_path, sample_names[sample_idx]) + data[0].save(path) + + def parse_cli_string(self, cli_words: str): + cli_words = cli_words.replace("lightly-upload ", "") + cli_words = re.split("=| ", cli_words) + assert len(cli_words) % 2 == 0 + dict_keys = cli_words[0::2] + dict_values = cli_words[1::2] + for key, value in zip(dict_keys, dict_values): + value = value.strip('\"') + value = value.strip('\'') + self.cfg[key] = value + + def test_parse_cli_string(self): + cli_string = "lightly-upload dataset_id='XYZ' upload='thumbnails'" + self.parse_cli_string(cli_string) + assert self.cfg["dataset_id"] == 'XYZ' + assert self.cfg["upload"] == 'thumbnails' + + def test_upload_no_token(self): + self.cfg['token']='' + with self.assertWarns(UserWarning): + lightly.cli.upload_cli(self.cfg) + + def test_upload_new_dataset_name(self): + cli_string = "lightly-upload new_dataset_name='new_dataset_name_xyz'" + self.parse_cli_string(cli_string) + lightly.cli.upload_cli(self.cfg) + + def 
test_upload_new_dataset_id(self): + cli_string = "lightly-upload dataset_id='xyz'" + self.parse_cli_string(cli_string) + lightly.cli.upload_cli(self.cfg) + + def test_upload_no_dataset(self): + cli_string = "lightly-upload input_dir=data/ token='123'" + self.parse_cli_string(cli_string) + with self.assertWarns(UserWarning): + lightly.cli.upload_cli(self.cfg) + + def test_upload_both_dataset(self): + cli_string = "lightly-upload new_dataset_name='new_dataset_name_xyz' dataset_id='xyz'" + self.parse_cli_string(cli_string) + with self.assertWarns(UserWarning): + lightly.cli.upload_cli(self.cfg) + + + From 3f6203b6314e8efb8c72e25f08844cc631e9dcd5 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Thu, 22 Apr 2021 10:32:14 +0200 Subject: [PATCH 25/27] Created 3 templates for pull requests (#312) Created 3 templates for following purposes: - minimal: Make it easy for external developers to do a PR - checklist: a medium-sized checklist with three checklist subsections: type of change, tests, docs - checklist_full: extends the checklist with an example for a manual test and more elaborate further issues --- .../PR_template_checklist.md | 21 +++++++++++++ .../PR_template_checklist_full.md | 31 +++++++++++++++++++ .../PR_template_minimal.md | 12 +++++++ 3 files changed, 64 insertions(+) create mode 100644 .github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md create mode 100644 .github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md create mode 100644 .github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md new file mode 100644 index 000000000..e9402109a --- /dev/null +++ b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md @@ -0,0 +1,21 @@ +closes #issue_number + +## Description +- [ ] My change is breaking +Please_describe_what_you_changed_and_why___You_do_not_need_to_repeat_stuff_from_the_issue + +## Tests +- [ ] My change is covered by existing tests. +- [ ] My change needs new tests. +- [ ] I have added/adapted the tests accordingly. +- [ ] I have manually tested the change. if_yes_describe_how + +## Documentation +- [ ] I have added docstrings to all public functions/methods. +- [ ] My change requires a change to the documentation ( `.rst` files). +- [ ] I have updated the documentation accordingly. +- [ ] The autodocs update the documentation accordingly. + +## Implications / comments / further issues +- #e_g_link_to_issue_to_cover_breaking_changes + diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md new file mode 100644 index 000000000..aaf66fe49 --- /dev/null +++ b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md @@ -0,0 +1,31 @@ +closes #issue_number + +## Description +- [ ] My change is breaking +Please_describe_what_you_changed_and_why___You_do_not_need_to_repeat_stuff_from_the_issue + +## Tests +- [ ] My change is covered by existing tests +- [ ] My change needs new tests +- [ ] I have added/adapted tests accordingly. +- [ ] I have manually tested the change. 
+ +If applicable, describe the manual test procedure, e.g: +```bash +pip uninstall lightly +export BRANCH_NAME="branch_name" +pip install "git+https://github.com/lightly-ai/lightly.git@$BRANCH_NAME" +lightly-cli_do_something_command +``` + +## Documentation +- [ ] I have added docstrings to all changed/added public functions/methods. +- [ ] My change requires a change to the documentation ( `.rst` files). +- [ ] I have updated the documentation accordingly. +- [ ] The autodocs update the documentation accordingly.` + +## Improvements put into another issue: +- #issue_number + +## Issues covering the breaking change: +- #link_to_issue_in_other_repo to adapt the other side of the breaking change \ No newline at end of file diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md new file mode 100644 index 000000000..f2ec4432c --- /dev/null +++ b/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md @@ -0,0 +1,12 @@ +closes #issue_number + +## Description +Please_describe_what_you_changed_and_why___You_do_not_need_to_repeat_stuff_from_the_issue + +## Documentation +- [ ] I have updated the documentation. +- [ ] I need help on it. + +## Tests +- [ ] I have updated the tests. +- [ ] I need help on it. \ No newline at end of file From 2e81ff1589075d4fcf962b7b2b5ca121e62bc986 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Thu, 22 Apr 2021 13:50:01 +0200 Subject: [PATCH 26/27] Bugfix: put PR templates one folder up (#316) --- .../PULL_REQUEST_TEMPLATE/PR_template_checklist.md | 0 .../PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md | 0 .../{templates => }/PULL_REQUEST_TEMPLATE/PR_template_minimal.md | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename .github/{templates => }/PULL_REQUEST_TEMPLATE/PR_template_checklist.md (100%) rename .github/{templates => }/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md (100%) rename .github/{templates => }/PULL_REQUEST_TEMPLATE/PR_template_minimal.md (100%) diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md b/.github/PULL_REQUEST_TEMPLATE/PR_template_checklist.md similarity index 100% rename from .github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist.md rename to .github/PULL_REQUEST_TEMPLATE/PR_template_checklist.md diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md b/.github/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md similarity index 100% rename from .github/templates/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md rename to .github/PULL_REQUEST_TEMPLATE/PR_template_checklist_full.md diff --git a/.github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md b/.github/PULL_REQUEST_TEMPLATE/PR_template_minimal.md similarity index 100% rename from .github/templates/PULL_REQUEST_TEMPLATE/PR_template_minimal.md rename to .github/PULL_REQUEST_TEMPLATE/PR_template_minimal.md From 7314e93077d11fc905c4a5152ad2ab4663122b22 Mon Sep 17 00:00:00 2001 From: MalteEbner Date: Thu, 22 Apr 2021 14:51:41 +0200 Subject: [PATCH 27/27] bump version 1.1.6 (#319) --- lightly/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightly/__init__.py b/lightly/__init__.py index c4463f560..e25608181 100644 --- a/lightly/__init__.py +++ b/lightly/__init__.py @@ -74,7 +74,7 @@ # All Rights Reserved __name__ = 'lightly' -__version__ = '1.1.5' +__version__ = '1.1.6' try:
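As a closing illustration of the calculate_scores() rename from patch 17, a small usage sketch based on the classification scorer tests above; the array sizes are placeholders.

    import numpy as np
    from lightly.active_learning.scorers import ScorerClassification

    # rows: unlabeled samples, columns: per-class probabilities summing to 1
    predictions = np.random.rand(100, 10).astype(np.float32)
    predictions_normalized = predictions / np.sum(predictions, axis=1)[:, np.newaxis]

    scorer = ScorerClassification(predictions_normalized)
    scores = scorer.calculate_scores()              # public method after the rename
    uncertainty = scores['prediction-entropy']      # one score per unlabeled sample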