Migration to jina3 (#8)
* tests:  update to jina 3

* fix(executors):  update to jina 3

* fix(cicd):  directly use the master branch of jina

* Revert "fix(cicd):  directly use the master branch of jina"

This reverts commit d26796e.

* fix: update dockerfile gpu to use jina master branch

* fix: remove optional docs
samsja authored Feb 11, 2022
1 parent e129998 commit 8e14d97
Showing 4 changed files with 26 additions and 27 deletions.
Dockerfile.gpu (2 changes: 1 addition & 1 deletion)
@@ -1,4 +1,4 @@
-FROM jinaai/jina:2-py37-perf
+FROM jinaai/jina:master-py37-perf
 
 COPY gpu_requirements.txt gpu_requirements.txt
 RUN pip install --no-cache-dir -r gpu_requirements.txt
tests/integration/test_integration.py (9 changes: 4 additions & 5 deletions)
@@ -16,17 +16,16 @@ def test_integration(request_size: int):
         [Document(text='just some random text here') for _ in range(50)]
     )
     with Flow(return_results=True).add(uses=TransformerTorchEncoder) as flow:
-        resp = flow.post(
+        da = flow.post(
             on='/index',
             inputs=docs,
             request_size=request_size,
             return_results=True,
         )
 
-        assert sum(len(resp_batch.docs) for resp_batch in resp) == 50
-        for r in resp:
-            for doc in r.docs:
-                assert doc.embedding.shape == (_EMBEDDING_DIM,)
+        assert len(da) == 50
+        for doc in da:
+            assert doc.embedding.shape == (_EMBEDDING_DIM,)
 
 
 @pytest.mark.gpu
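For reference, the updated test relies on Jina 3's Flow.post returning a single DocumentArray instead of a list of response objects. A minimal standalone sketch of that behaviour, with a made-up ToyEncoder standing in for TransformerTorchEncoder and an arbitrary 768-dimensional dummy embedding:

import numpy as np
from docarray import Document, DocumentArray
from jina import Executor, Flow, requests


class ToyEncoder(Executor):
    """Hypothetical stand-in for TransformerTorchEncoder (not part of this repo)."""

    @requests
    def encode(self, docs: DocumentArray, **kwargs):
        # Give every Document a dummy 768-dimensional embedding.
        docs.embeddings = np.zeros((len(docs), 768))


docs = DocumentArray([Document(text='just some random text here') for _ in range(50)])

with Flow().add(uses=ToyEncoder) as flow:
    # In Jina 3, post() returns one flattened DocumentArray of processed Documents.
    da = flow.post(on='/index', inputs=docs, request_size=10)

assert len(da) == 50
assert da[0].embedding.shape == (768,)

Because the result is already a flat DocumentArray, the per-response unpacking (resp_batch.docs) used with Jina 2 is no longer needed.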
tests/unit/test_encoder.py (16 changes: 9 additions & 7 deletions)
@@ -101,10 +101,10 @@ def test_pooling_strategy(pooling_strategy: str):
 @pytest.mark.parametrize(
     'traversal_paths, counts',
     [
-        ('r', [['r', 1], ['c', 0], ['cc', 0]]),
-        ('c', [['r', 0], ['c', 3], ['cc', 0]]),
-        ('cc', [['r', 0], ['c', 0], ['cc', 2]]),
-        ('cc,r', [['r', 1], ['c', 0], ['cc', 2]]),
+        ('@r', [['@r', 1], ['@c', 0], ['@cc', 0]]),
+        ('@c', [['@r', 0], ['@c', 3], ['@cc', 0]]),
+        ('@cc', [['@r', 0], ['@c', 0], ['@cc', 2]]),
+        ('@cc,r', [['@r', 1], ['@c', 0], ['@cc', 2]]),
     ],
 )
 def test_traversal_path(
@@ -124,9 +124,11 @@ def test_traversal_path(

     basic_encoder.encode(docs=docs, parameters={'traversal_paths': traversal_paths})
     for path, count in counts:
-        embeddings = docs.traverse_flat(path).get_attributes('embedding')
-        assert len(list(filter(lambda x: x is not None, embeddings))) == count
-
+        embeddings = docs[path].embeddings
+        if count != 0:
+            assert len([em for em in embeddings if em is not None]) == count
+        else:
+            assert embeddings is None
 
 @pytest.mark.parametrize('batch_size', [1, 2, 4, 8])
 def test_batch_size(basic_encoder: TransformerTorchEncoder, batch_size: int):
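For reference, the new parameters use DocArray's '@' path-selector syntax, which replaces the bare 'r'/'c'/'cc' traversal strings of Jina 2. A minimal sketch of how the selectors behave on a hand-built Document tree (the example contents are purely illustrative):

from docarray import Document, DocumentArray

docs = DocumentArray(
    [
        Document(
            text='root',
            chunks=[Document(text='chunk', chunks=[Document(text='chunk of chunk')])],
        )
    ]
)

# '@r' selects the root Documents, '@c' their chunks, '@cc' the chunks of those chunks.
assert len(docs['@r']) == 1
assert len(docs['@c']) == 1
assert len(docs['@cc']) == 1

# As in the updated test, a selection whose Documents carry no embeddings
# reports None for `.embeddings`.
assert docs['@c'].embeddings is None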
transform_encoder.py (26 changes: 12 additions & 14 deletions)
@@ -23,7 +23,7 @@ def __init__(
         max_length: Optional[int] = None,
         embedding_fn_name: str = '__call__',
         device: str = 'cpu',
-        traversal_paths: str = 'r',
+        traversal_paths: str = '@r',
         batch_size: int = 32,
         *args,
         **kwargs,
@@ -67,7 +67,7 @@ def __init__(
         self.model.to(device).eval()
 
     @requests
-    def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
+    def encode(self, docs: DocumentArray, parameters: Dict={}, **kwargs):
         """
         Encode text data into a ndarray of `D` as dimension, and fill the embedding of
         each Document.
@@ -78,18 +78,16 @@ def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
         `parameters={'traversal_paths': 'r', 'batch_size': 10}`.
         :param kwargs: Additional key value arguments.
         """
-        if docs is None:
-            return
 
-        docs_batch_generator = docs.traverse_flat(
-            traversal_paths=parameters.get('traversal_paths', self.traversal_paths),
-            filter_fn=lambda doc: len(doc.text) > 0
-        ).batch(
-            batch_size=parameters.get('batch_size', self.batch_size),
-        )
+        docs_batch_generator = DocumentArray(
+            filter(
+                lambda x: bool(x.text),
+                docs[parameters.get('traversal_paths', self.traversal_paths)],
+            )
+        ).batch(batch_size=parameters.get('batch_size', self.batch_size))
 
         for batch in docs_batch_generator:
-            texts = batch.get_attributes('text')
+            texts = batch.texts
 
             with torch.inference_mode():
                 input_tokens = self._generate_input_tokens(texts)
@@ -98,8 +96,8 @@ def encode(self, docs: Optional[DocumentArray], parameters: Dict={}, **kwargs):
                 outputs = outputs.cpu().numpy()
                 hidden_states = outputs.hidden_states
                 embeds = self._compute_embedding(hidden_states, input_tokens)
-                for doc, embed in zip(batch, embeds):
-                    doc.embedding = embed
+                batch.embeddings = embeds
+
 
     def _compute_embedding(
         self, hidden_states: Tuple['torch.Tensor'], input_tokens: Dict
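For reference, the rewritten encode() selects Documents through DocArray indexing, keeps only those that carry text, and writes embeddings back per batch. A minimal standalone sketch of that pattern, with dummy 3-dimensional vectors in place of real model outputs:

import numpy as np
from docarray import Document, DocumentArray

docs = DocumentArray(
    [Document(text='first'), Document(), Document(text='second'), Document(text='third')]
)

traversal_paths = '@r'  # same default the executor now uses
batch_size = 2

# Select Documents via the traversal path and drop those without text,
# mirroring the filter(...) call in the updated encode().
selected = DocumentArray(filter(lambda d: bool(d.text), docs[traversal_paths]))

for batch in selected.batch(batch_size=batch_size):
    texts = batch.texts                           # texts of this batch
    # ... run the model on `texts` here; dummy vectors stand in for embeddings ...
    batch.embeddings = np.zeros((len(texts), 3))  # one vector per Document

assert all(doc.embedding is not None for doc in selected)

Because DocumentArray batches hold references to the same Document objects, assigning batch.embeddings updates the Documents of the original request.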
