Commit 2f70907

Merge branch 'master' into sunu-graph-api
2 parents 9cae990 + fecb99a commit 2f70907


75 files changed (+1161 -907 lines)

.travis.yml (+3 -1)

@@ -35,8 +35,10 @@ script:
 after_success:
   - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
   # push tag as well
-  - if [[ $TRAVIS_TAG = $TRAVIS_BRANCH ]]; then
+  - if [[ $TRAVIS_BRANCH = "master" ]]; then
      make TAG=latest build-full docker-push ;
+    fi
+  - if [[ $TRAVIS_TAG = $TRAVIS_BRANCH ]]; then
      make TAG=$TRAVIS_TAG build-full docker-push ;
     fi
   - if [[ $TRAVIS_BRANCH = "occrp/production" ]]; then

Dockerfile (+1 -2)

@@ -50,8 +50,7 @@ COPY . /aleph
 WORKDIR /aleph
 ENV PYTHONPATH /aleph
 RUN pip install -e /aleph
-RUN cd /usr/local/lib/python3.6/dist-packages && python3 /aleph/setup.py develop
-
+# RUN cd /usr/local/lib/python3.6/dist-packages && python3 /aleph/setup.py develop

 # Configure some docker defaults:
 ENV C_FORCE_ROOT=true \

Makefile (+6 -6)

@@ -8,13 +8,14 @@ services:
 	$(COMPOSE) up -d --remove-orphans \
 		rabbitmq postgres elasticsearch \
 		convert-document extract-entities \
-		extract-countries recognize-text
+		recognize-text

 shell: services
 	$(DEVDOCKER) /bin/bash

 test:
 	$(DEVDOCKER) contrib/test.sh
+	$(COMPOSE) run --rm extract-entities pytest

 upgrade: build
 	$(COMPOSE) up -d postgres elasticsearch

@@ -49,18 +50,18 @@ build:
 	docker build --cache-from alephdata/aleph-convert-document -t alephdata/aleph-convert-document:$(TAG) services/convert-document
 	docker build --cache-from alephdata/aleph-recognize-text -t alephdata/aleph-recognize-text:$(TAG) services/recognize-text
 	docker build --cache-from alephdata/aleph-extract-entities -t alephdata/aleph-extract-entities:$(TAG) services/extract-entities
-	docker build --cache-from alephdata/aleph-extract-countries -t alephdata/aleph-extract-countries:$(TAG) services/extract-countries

-build-full: build
-	docker build -t alephdata/aleph-ui-production:$(TAG) ui/production
+build-ui:
+	docker build -t alephdata/aleph-ui-production:$(TAG) -f ui/Dockerfile.production ui
+
+build-full: build build-ui

 docker-pull:
 	docker pull alephdata/aleph
 	docker pull alephdata/aleph-ui
 	docker pull alephdata/aleph-convert-document
 	docker pull alephdata/aleph-recognize-text
 	docker pull alephdata/aleph-extract-entities
-	docker pull alephdata/aleph-extract-countries

 docker-push:
 	docker push alephdata/aleph:$(TAG)

@@ -69,7 +70,6 @@ docker-push:
 	docker push alephdata/aleph-convert-document:$(TAG)
 	docker push alephdata/aleph-recognize-text:$(TAG)
 	docker push alephdata/aleph-extract-entities:$(TAG)
-	docker push alephdata/aleph-extract-countries:$(TAG)

 dev:
 	pip install -q transifex-client bumpversion babel

aleph/analyze/extract_country.py (-33)

This file was deleted.

aleph/analyze/extract_entity.py (+12 -3)

@@ -18,6 +18,11 @@ class EntityExtractor(EntityAnalyzer, TextIterator, ServiceClientMixin):
         ExtractedEntity.PERSON: DocumentTag.TYPE_PERSON,
         ExtractedEntity.ORGANIZATION: DocumentTag.TYPE_ORGANIZATION,
         ExtractedEntity.COMPANY: DocumentTag.TYPE_ORGANIZATION,
+        ExtractedEntity.PHONE: DocumentTag.TYPE_PHONE,
+        ExtractedEntity.EMAIL: DocumentTag.TYPE_EMAIL,
+        ExtractedEntity.IBAN: DocumentTag.TYPE_IBAN,
+        ExtractedEntity.IPADDRESS: DocumentTag.TYPE_IP,
+        ExtractedEntity.LOCATION: DocumentTag.TYPE_LOCATION
     }

     def __init__(self):

@@ -31,10 +36,14 @@ def extract(self, collector, document):
             texts = self.text_iterator(document)
             entities = service.Extract(texts)
             for entity in entities.entities:
+                if entity.type == ExtractedEntity.COUNTRY:
+                    document.add_country(entity.label)
+                if entity.type == ExtractedEntity.LANGUAGE:
+                    document.add_language(entity.label)
                 type_ = self.TYPES.get(entity.type)
-                if type_ is None:
-                    continue
-                collector.emit(entity.label, type_, weight=entity.weight)
+                # log.info('%s: %s', entity.label, type_)
+                if type_ is not None:
+                    collector.emit(entity.label, type_, weight=entity.weight)
             log.info('Extracted %s entities.', len(collector))
         except self.Error as e:
             log.warning("gRPC [%s]: %s", e.code(), e.details())

aleph/analyze/language.py (-46)

This file was deleted.

aleph/analyze/regex.py (-62)

This file was deleted.

aleph/index/admin.py (+1 -2)

@@ -32,7 +32,6 @@ def upgrade_search():
         es.indices.create(index, body=body, ignore=[404, 400])
         # es.indices.put_mapping(index=index, doc_type='doc', body=mapping)
         es.indices.open(index=index, ignore=[400, 404])
-        es.indices.refresh(index=index, ignore=[400, 404])
         es.indices.clear_cache(index=index, ignore=[400, 404])

@@ -42,4 +41,4 @@ def delete_index():

 def clear_index():
     q = {'query': {'match_all': {}}}
-    es.delete_by_query(index=all_indexes(), body=q, refresh=True)
+    es.delete_by_query(index=all_indexes(), body=q)
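Both hunks drop eager refreshes: upgrade_search() no longer forces an index refresh and clear_index() loses refresh=True, leaving Elasticsearch to refresh on its own interval. Call sites that genuinely need read-after-write visibility (test fixtures, for instance) can still request one explicitly; a hedged sketch reusing the same client call the removed line used (flush_index is a hypothetical helper name, not part of this commit):

from aleph.core import es


def flush_index(index):
    # Force a refresh so searches issued immediately after a write see it;
    # kept out of the indexing hot path and invoked only where needed.
    es.indices.refresh(index=index, ignore=[400, 404])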

aleph/index/collections.py (+5 -10)

@@ -1,14 +1,13 @@
 import logging
-import exactitude
 from pprint import pprint  # noqa
 from normality import normalize
+from followthemoney import types

 from aleph.core import es
 from aleph.model import Entity, Collection
 from aleph.index.core import collections_index, entities_index, records_index
 from aleph.index.util import query_delete, query_update, unpack_result
-from aleph.index.util import index_safe, index_form, refresh_index
-from aleph.index.util import search_safe
+from aleph.index.util import index_safe, index_form, search_safe

 log = logging.getLogger(__name__)

@@ -86,19 +85,17 @@ def index_collection(collection):
     if countries is None or not len(countries):
         countries = aggregations['countries']['buckets']
         countries = [c['key'] for c in countries]
-    data['countries'] = exactitude.countries.normalize_set(countries)
+    data['countries'] = types.countries.normalize_set(countries)

     languages = collection.languages
     if languages is None or not len(languages):
         languages = aggregations['languages']['buckets']
         languages = [c['key'] for c in languages]
-    data['languages'] = exactitude.languages.normalize_set(languages)
+    data['languages'] = types.languages.normalize_set(languages)

     texts.extend([normalize(t, ascii=True) for t in texts])
     data['text'] = index_form(texts)
-    data = index_safe(collections_index(), collection.id, data)
-    refresh_index(index=collections_index())
-    return data
+    return index_safe(collections_index(), collection.id, data)


 def get_collection(collection_id):

@@ -127,7 +124,6 @@ def delete_collection(collection_id):
     """Delete all documents from a particular collection."""
     q = {'ids': {'values': str(collection_id)}}
     query_delete(collections_index(), q)
-    refresh_index(index=collections_index())


 def delete_entities(collection_id):

@@ -143,7 +139,6 @@ def delete_documents(collection_id):
     """Delete documents from a collection."""
     records_query = {'term': {'collection_id': collection_id}}
     query_delete(records_index(), records_query)
-    refresh_index(index=records_index())
     query = {'bool': {
         'must': [
             {'term': {'schemata': 'Document'}},
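The exactitude dependency gives way to the type registry that now ships with followthemoney; normalize_set() keeps the same contract of cleaning a list of values into canonical codes and dropping anything it cannot recognise. A rough usage sketch (the exact canonical forms depend on the installed followthemoney version, so the expected outputs are indicative only):

from followthemoney import types

# Mixed, messy inputs such as aggregation bucket keys or user metadata:
countries = types.countries.normalize_set(['DE', 'Germany', 'not-a-country'])
languages = types.languages.normalize_set(['eng', 'deu', 'zz'])
# Expect canonical codes back with unmatchable values dropped,
# e.g. something like ['de'] and ['eng', 'deu'].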

aleph/index/documents.py (+5 -37)

@@ -1,16 +1,12 @@
 import logging
 from pprint import pprint  # noqa
-from collections import defaultdict

-from aleph.core import celery, db
-from aleph.model import Document, DocumentTag
-from aleph.index.core import records_index
+from aleph.core import celery
+from aleph.model import Document
 from aleph.index.records import index_records, clear_records
 from aleph.index.entities import delete_entity, index_single
-from aleph.index.util import refresh_index

 log = logging.getLogger(__name__)
-MAX_TAGS_PER_DOCUMENT = 1000


 @celery.task()

@@ -23,31 +19,11 @@ def index_document_id(document_id):
     index_records(document)


-def generate_tags(document):
-    """Transform document tag objects into normalized tag snippets."""
-    if document.status == Document.STATUS_PENDING:
-        return []
-    tags = defaultdict(set)
-    q = db.session.query(DocumentTag)
-    q = q.filter(DocumentTag.document_id == document.id)
-    q = q.order_by(DocumentTag.weight.desc())
-    q = q.limit(MAX_TAGS_PER_DOCUMENT)
-    for tag in q.all():
-        type_ = DocumentTag.TYPES[tag.type]
-        values = type_.normalize(tag.text,
-                                 cleaned=True,
-                                 countries=document.countries)
-        if tag.field is not None:
-            tags[tag.field].update(values)
-
-    # pprint(dict(tags))
-    return tags.items()
-
-
 def index_document(document):
     name = document.name
     log.info("Index document [%s]: %s", document.id, name)
-    data = {
+    data = document.to_schema_entity()
+    data.update({
         'status': document.status,
         'content_hash': document.content_hash,
         'foreign_id': document.foreign_id,

@@ -77,7 +53,7 @@ def index_document(document):
         'columns': document.columns,
         'ancestors': document.ancestors,
         'children': document.children.count()
-    }
+    })

     texts = list(document.texts)
     texts.extend(document.columns)

@@ -90,17 +66,9 @@ def index_document(document):
             'title': document.parent.title,
         }

-    for (field, values) in generate_tags(document):
-        if field not in data:
-            data[field] = list(values)
-        else:
-            data[field].extend(values)
-        texts.extend(values)
-
     return index_single(document, data, texts)


 def delete_document(document_id):
     clear_records(document_id)
     delete_entity(document_id)
-    refresh_index(index=records_index())
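With generate_tags() removed (tag fields now arrive through the entity extractor above), index_document() assembles its payload by layering indexing metadata over the document's schema serialization, so the explicit keys always win on conflict. A condensed sketch of that assembly, assuming to_schema_entity() returns a plain dict, as the use of dict.update() implies:

from aleph.index.entities import index_single


def build_and_index(document, texts):
    data = document.to_schema_entity()   # schema/entity fields as the base
    data.update({                        # indexing metadata layered on top
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        # ... remaining fields exactly as in the hunk above ...
    })
    return index_single(document, data, texts)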
