Skip to content

Commit 63a13dd

Browse files
committed
Inline NER as an experiment
1 parent a2abe5d commit 63a13dd

File tree

5 files changed

+36
-60
lines changed

5 files changed

+36
-60
lines changed

Dockerfile

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
FROM alephdata/aleph-base:8
22

33
# Install Python dependencies
4+
RUN pip3 install spacy-nightly
5+
RUN python3 -m spacy download xx
46
COPY requirements-generic.txt /tmp/
57
RUN pip3 install --no-cache-dir -r /tmp/requirements-generic.txt
68
COPY requirements-toolkit.txt /tmp/

aleph/logic/collections.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,20 @@ def refresh_collection(collection_id, sync=False):
3838
cache.kv.delete(cache.object_key(Collection, collection_id))
3939

4040

41+
def index_collection(collection, entities=False, refresh=False):
    """Index one collection, optionally with its entities.

    :param collection: the collection to index; its ``id``, ``label`` and
        ``deleted_at`` attributes are read here.
    :param entities: when true and the collection has no ``deleted_at``
        value, also dispatch ``index_collection_entities`` for it.
    :param refresh: when true, call ``refresh_collection`` for the
        collection before it is indexed.
    """
    log.info("Index [%s]: %s", collection.id, collection.label)

    if entities:
        # Entity indexing is only dispatched for collections that have
        # not been marked as deleted.
        if collection.deleted_at is None:
            index_collection_entities.delay(collection_id=collection.id)

    if refresh:
        refresh_collection(collection.id)

    index.index_collection(collection)
48+
49+
4150
def index_collections(entities=False, refresh=False):
    """Run ``index_collection`` over every collection.

    Collections are queried with ``deleted=True`` and visited in
    descending ``updated_at`` order.

    :param entities: forwarded to ``index_collection`` for each collection.
    :param refresh: forwarded to ``index_collection`` for each collection.
    """
    collections = Collection.all(deleted=True).order_by(
        Collection.updated_at.desc()
    )
    for collection in collections:
        index_collection(collection, entities=entities, refresh=refresh)
5155

5256

5357
def delete_collection(collection, sync=False):

aleph/logic/extractors/extract.py

+24-35
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,32 @@
1+
import spacy
12
import logging
2-
import textwrap
3-
from servicelayer.rpc import ExtractedEntity
4-
from servicelayer.rpc import EntityExtractService
53

64
from aleph import settings
7-
from aleph.tracing import trace_function
8-
from aleph.logic.extractors.result import PersonResult, LocationResult
9-
from aleph.logic.extractors.result import OrganizationResult, LanguageResult
5+
from aleph.logic.extractors.result import PersonResult
6+
from aleph.logic.extractors.result import LocationResult
7+
from aleph.logic.extractors.result import OrganizationResult
108

119
log = logging.getLogger(__name__)
12-
13-
14-
class NERService(EntityExtractService):
15-
MIN_LENGTH = 60
16-
MAX_LENGTH = 100000
17-
TYPES = {
18-
ExtractedEntity.ORGANIZATION: OrganizationResult,
19-
ExtractedEntity.PERSON: PersonResult,
20-
ExtractedEntity.LOCATION: LocationResult,
21-
ExtractedEntity.LANGUAGE: LanguageResult
22-
}
23-
24-
@trace_function(span_name='NER')
25-
def extract_all(self, text, languages):
26-
if text is None or len(text) < self.MIN_LENGTH:
27-
return
28-
if len(text) > self.MAX_LENGTH:
29-
texts = textwrap.wrap(text, self.MAX_LENGTH)
30-
else:
31-
texts = [text]
32-
for text in texts:
33-
for res in self.Extract(text, languages):
34-
clazz = self.TYPES.get(res.type)
35-
yield (res.text, clazz, res.start, res.end)
10+
# Texts shorter than this many characters are not passed to the NER model.
MIN_LENGTH = 60
# Upper bound, in characters, for a single piece of text.
MAX_LENGTH = 100000

# Maps spaCy entity labels onto the result classes used by the extractors.
# https://spacy.io/api/annotation#named-entities
SPACY_TYPES = dict(
    PER=PersonResult,
    PERSON=PersonResult,
    ORG=OrganizationResult,
    LOC=LocationResult,
    GPE=LocationResult,
)
3620

3721

3822
def _split_text(text, max_length):
    """Yield consecutive pieces of ``text``, each at most ``max_length`` long.

    Pieces are cut at a space where one is available so that words are not
    split in half; no characters are dropped.
    """
    total = len(text)
    start = 0
    while start < total:
        end = min(start + max_length, total)
        if end < total:
            cut = text.rfind(' ', start, end)
            # Only use the space if it still lets the piece make progress.
            if cut > start:
                end = cut
        yield text[start:end]
        start = end


def extract_entities(ctx, text, languages):
    """Yield entity results for the named entities spaCy finds in ``text``.

    :param ctx: passed through unchanged to the result class ``create`` call.
    :param text: the text to analyse; ``None`` or text shorter than
        ``MIN_LENGTH`` characters yields nothing.
    :param languages: accepted for interface compatibility; not used here,
        the ``xx`` model is loaded regardless of its value.
    :returns: a generator of objects built by ``clazz.create`` for each
        entity whose spaCy label is present in ``SPACY_TYPES``.
    """
    if text is None or len(text) < MIN_LENGTH:
        return
    if not hasattr(settings, '_nlp'):
        # Load the model once and keep it on ``settings`` for later calls.
        settings._nlp = spacy.load('xx')
    # Feed the model at most MAX_LENGTH characters at a time so very long
    # documents are not handed to spaCy as one oversized text.
    for chunk in _split_text(text, MAX_LENGTH):
        doc = settings._nlp(chunk)
        for ent in doc.ents:
            clazz = SPACY_TYPES.get(ent.label_)
            label = ent.text.strip()
            if clazz is None or not label:
                continue
            # NOTE(review): in spaCy, ``ent.start``/``ent.end`` are token
            # indices (character offsets are ``start_char``/``end_char``),
            # and here they are relative to the current chunk -- confirm
            # which kind of offset the result classes expect.
            yield clazz.create(ctx, label, ent.start, ent.end)

docker-compose.dev.yml

-8
Original file line numberDiff line numberDiff line change
@@ -46,12 +46,6 @@ services:
4646
restart: on-failure
4747
expose:
4848
- 50000
49-
50-
extract-entities:
51-
image: alephdata/extract-entities:1.2.0
52-
restart: on-failure
53-
expose:
54-
- 50000
5549

5650
app:
5751
image: alephdata/aleph
@@ -63,7 +57,6 @@ services:
6357
- redis
6458
- convert-document
6559
- recognize-text
66-
- extract-entities
6760
tmpfs: /tmp
6861
volumes:
6962
- archive-data:/data
@@ -96,7 +89,6 @@ services:
9689
- redis
9790
- convert-document
9891
- recognize-text
99-
- extract-entities
10092
tmpfs: /tmp
10193
volumes:
10294
- archive-data:/data

docker-compose.yml

-11
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,6 @@ services:
4444
expose:
4545
- 50000
4646

47-
extract-entities:
48-
image: alephdata/extract-entities:1.2.0
49-
restart: on-failure
50-
mem_limit: 4g
51-
expose:
52-
- 50000
53-
5447
worker:
5548
image: alephdata/aleph
5649
command: celery -A aleph.queues -B -c 4 -l INFO worker
@@ -62,7 +55,6 @@ services:
6255
- redis
6356
- convert-document
6457
- recognize-text
65-
- extract-entities
6658
tmpfs:
6759
- /tmp
6860
volumes:
@@ -81,7 +73,6 @@ services:
8173
- redis
8274
- convert-document
8375
- recognize-text
84-
- extract-entities
8576
- worker
8677
tmpfs:
8778
- /tmp
@@ -103,7 +94,6 @@ services:
10394
- worker
10495
- convert-document
10596
- recognize-text
106-
- extract-entities
10797
tmpfs:
10898
- /tmp
10999
volumes:
@@ -124,7 +114,6 @@ services:
124114
- worker
125115
- convert-document
126116
- recognize-text
127-
- extract-entities
128117
tmpfs:
129118
- /tmp
130119
volumes:

0 commit comments

Comments
 (0)