Migration to jina3 (#8)
* tests:  update to jina 3

* fix(executors):  update to jina 3

* fix(cicd):  directly use the master branch of jina

* Revert "fix(cicd):  directly use the master branch of jina"

This reverts commit d26796e.

* fix: update dockerfile gpu to use jina master branch

* fix: remove optional docs
samsja authored Feb 11, 2022
1 parent e129998 commit 8e14d97
Showing 4 changed files with 26 additions and 27 deletions.
Dockerfile.gpu (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-FROM jinaai/jina:2-py37-perf
+FROM jinaai/jina:master-py37-perf
 
 COPY gpu_requirements.txt gpu_requirements.txt
 RUN pip install --no-cache-dir -r gpu_requirements.txt
tests/integration/test_integration.py (9 changes: 4 additions & 5 deletions)
@@ -16,17 +16,16 @@ def test_integration(request_size: int):
         [Document(text='just some random text here') for _ in range(50)]
     )
     with Flow(return_results=True).add(uses=TransformerTorchEncoder) as flow:
-        resp = flow.post(
+        da = flow.post(
             on='/index',
             inputs=docs,
             request_size=request_size,
             return_results=True,
         )
 
-        assert sum(len(resp_batch.docs) for resp_batch in resp) == 50
-        for r in resp:
-            for doc in r.docs:
-                assert doc.embedding.shape == (_EMBEDDING_DIM,)
+        assert len(da) == 50
+        for doc in da:
+            assert doc.embedding.shape == (_EMBEDDING_DIM,)
 
 
 @pytest.mark.gpu
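For reference, the updated test relies on Jina 3's Flow.post returning a single DocumentArray instead of a list of response objects. A minimal standalone sketch of that behaviour, with a made-up ToyEncoder standing in for TransformerTorchEncoder and an arbitrary 768-dimensional dummy embedding:

import numpy as np
from docarray import Document, DocumentArray
from jina import Executor, Flow, requests


class ToyEncoder(Executor):
    """Hypothetical stand-in for TransformerTorchEncoder (not part of this repo)."""

    @requests
    def encode(self, docs: DocumentArray, **kwargs):
        # Give every Document a dummy 768-dimensional embedding.
        docs.embeddings = np.zeros((len(docs), 768))


docs = DocumentArray([Document(text='just some random text here') for _ in range(50)])

with Flow().add(uses=ToyEncoder) as flow:
    # In Jina 3, post() returns one flattened DocumentArray of processed Documents.
    da = flow.post(on='/index', inputs=docs, request_size=10)

assert len(da) == 50
assert da[0].embedding.shape == (768,)

Because the result is already a flat DocumentArray, the per-response unpacking (resp_batch.docs) used with Jina 2 is no longer needed.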
tests/unit/test_encoder.py (16 changes: 9 additions & 7 deletions)
@@ -101,10 +101,10 @@ def test_pooling_strategy(pooling_strategy: str):
 @pytest.mark.parametrize(
     'traversal_paths, counts',
     [
-        ('r', [['r', 1], ['c', 0], ['cc', 0]]),
-        ('c', [['r', 0], ['c', 3], ['cc', 0]]),
-        ('cc', [['r', 0], ['c', 0], ['cc', 2]]),
-        ('cc,r', [['r', 1], ['c', 0], ['cc', 2]]),
+        ('@r', [['@r', 1], ['@c', 0], ['@cc', 0]]),
+        ('@c', [['@r', 0], ['@c', 3], ['@cc', 0]]),
+        ('@cc', [['@r', 0], ['@c', 0], ['@cc', 2]]),
+        ('@cc,r', [['@r', 1], ['@c', 0], ['@cc', 2]]),
     ],
 )
 def test_traversal_path(
@@ -124,9 +124,11 @@ def test_traversal_path(

     basic_encoder.encode(docs=docs, parameters={'traversal_paths': traversal_paths})
     for path, count in counts:
-        embeddings = docs.traverse_flat(path).get_attributes('embedding')
-        assert len(list(filter(lambda x: x is not None, embeddings))) == count
-
+        embeddings = docs[path].embeddings
+        if count != 0:
+            assert len([em for em in embeddings if em is not None]) == count
+        else:
+            assert embeddings is None
 
 @pytest.mark.parametrize('batch_size', [1, 2, 4, 8])
 def test_batch_size(basic_encoder: TransformerTorchEncoder, batch_size: int):
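For reference, the new parameters use DocArray's '@' path-selector syntax, which replaces the bare 'r'/'c'/'cc' traversal strings of Jina 2. A minimal sketch of how the selectors behave on a hand-built Document tree (the example contents are purely illustrative):

from docarray import Document, DocumentArray

docs = DocumentArray(
    [
        Document(
            text='root',
            chunks=[Document(text='chunk', chunks=[Document(text='chunk of chunk')])],
        )
    ]
)

# '@r' selects the root Documents, '@c' their chunks, '@cc' the chunks of those chunks.
assert len(docs['@r']) == 1
assert len(docs['@c']) == 1
assert len(docs['@cc']) == 1

# As in the updated test, a selection whose Documents carry no embeddings
# reports None for `.embeddings`.
assert docs['@c'].embeddings is None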
transform_encoder.py (26 changes: 12 additions & 14 deletions)
@@ -23,7 +23,7 @@ def __init__(
         max_length: Optional[int] = None,
         embedding_fn_name: str = '__call__',
         device: str = 'cpu',
-        traversal_paths: str = 'r',
+        traversal_paths: str = '@r',
         batch_size: int = 32,
         *args,
         **kwargs,
@@ -67,7 +67,7 @@ def __init__(
         self.model.to(device).eval()
 
     @requests
-    def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
+    def encode(self, docs: DocumentArray, parameters: Dict={}, **kwargs):
         """
         Encode text data into a ndarray of `D` as dimension, and fill the embedding of
         each Document.
@@ -78,18 +78,16 @@ def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
         `parameters={'traversal_paths': 'r', 'batch_size': 10}`.
         :param kwargs: Additional key value arguments.
         """
-        if docs is None:
-            return
 
-        docs_batch_generator = docs.traverse_flat(
-            traversal_paths=parameters.get('traversal_paths', self.traversal_paths),
-            filter_fn=lambda doc: len(doc.text) > 0
-        ).batch(
-            batch_size=parameters.get('batch_size', self.batch_size),
-        )
+        docs_batch_generator = DocumentArray(
+            filter(
+                lambda x: bool(x.text),
+                docs[parameters.get('traversal_paths', self.traversal_paths)],
+            )
+        ).batch(batch_size=parameters.get('batch_size', self.batch_size))
 
         for batch in docs_batch_generator:
-            texts = batch.get_attributes('text')
+            texts = batch.texts
 
             with torch.inference_mode():
                 input_tokens = self._generate_input_tokens(texts)
@@ -98,8 +96,8 @@ def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
                 outputs = outputs.cpu().numpy()
                 hidden_states = outputs.hidden_states
                 embeds = self._compute_embedding(hidden_states, input_tokens)
-                for doc, embed in zip(batch, embeds):
-                    doc.embedding = embed
+                batch.embeddings = embeds
+
 
     def _compute_embedding(
         self, hidden_states: Tuple['torch.Tensor'], input_tokens: Dict
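For reference, the rewritten encode() selects Documents through DocArray indexing, keeps only those that carry text, and writes embeddings back per batch. A minimal standalone sketch of that pattern, with dummy 3-dimensional vectors in place of real model outputs:

import numpy as np
from docarray import Document, DocumentArray

docs = DocumentArray(
    [Document(text='first'), Document(), Document(text='second'), Document(text='third')]
)

traversal_paths = '@r'  # same default the executor now uses
batch_size = 2

# Select Documents via the traversal path and drop those without text,
# mirroring the filter(...) call in the updated encode().
selected = DocumentArray(filter(lambda d: bool(d.text), docs[traversal_paths]))

for batch in selected.batch(batch_size=batch_size):
    texts = batch.texts                           # texts of this batch
    # ... run the model on `texts` here; dummy vectors stand in for embeddings ...
    batch.embeddings = np.zeros((len(texts), 3))  # one vector per Document

assert all(doc.embedding is not None for doc in selected)

Because DocumentArray batches hold references to the same Document objects, assigning batch.embeddings updates the Documents of the original request.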
