
Change how search is done on ror and fundreg.
kmccurley committed Dec 22, 2024
1 parent f7bdcdb commit b508a6c
Showing 10 changed files with 2,139 additions and 85 deletions.
1,009 changes: 1,009 additions & 0 deletions search/searchapp/index/countries.py

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions search/searchapp/index/create_index.py
@@ -43,7 +43,6 @@ def create_index(dbpath, funderlist, verbose=False):
termgenerator.set_flags(termgenerator.FLAG_SPELLING);
count = 0
for funder in funderlist.funders.values():
narrower = {}
index_funder(funder, db, termgenerator)
count += 1
if count % 5000 == 0:
@@ -63,12 +62,14 @@ def fetch_fundreg():
def fetch_ror():
print('fetching ROR data')
# Apparently we have to use the zenodo schema to determine the date on the latest ROR data.
response = requests.get('https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent')
version_data = response.json().get('hits').get('hits')[0]
response = requests.get('https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent').json()
print(json.dumps(response, indent=2))
version_data = response.get('hits').get('hits')[0]
print(json.dumps(version_data, indent=2))
publication_date = version_data.get('metadata').get('publication_date')
print('ROR data from {}'.format(publication_date))
latest_url = version_data.get('files')[0].get('links').get('self')
latest_url = latest_url.replace('.json', '_schema_v2.json')
print('fetching {}'.format(latest_url))
# latest_url should be a zip file.
with requests.get(latest_url, stream=True) as stream:
@@ -80,7 +81,7 @@ def fetch_ror():
namelist = zipObj.namelist()
zipObj.extractall()
for fname in namelist:
if fname.endswith('.json'):
if fname.endswith('_schema_v2.json'):
os.rename(fname, _RAW_ROR_JSON)
else:
try:
@@ -181,10 +182,26 @@ def parse_ror(filename):
arguments.add_argument('--exclude_dup_fundref',
action='store_true',
help='Whether to replace Fundref with corresponding ROR')
arguments.add_argument('--easter_egg',
action='store_true',
help='Whether to add an easter egg.')
args = arguments.parse_args()
ror_file = Path(_ROR_JSON)
country_map = json.loads(open('data/country_map.json', 'r').read())
allfunders = FunderList(funders={})
if args.easter_egg:
obj = {'source': 'ror',
'source_id': 'ror_0ohdarn00',
'name': 'University of Second Choice',
'country': 'Odarn',
'funder_type': 'Education',
'country_code': 'OO',
'altnames': [],
'children': [],
'parents': [],
'related': []}
allfunders.funders['ror_0unreal13'] = Funder(**obj)
print(allfunders.funders)
if os.path.isfile(args.dbpath) or os.path.isdir(args.dbpath):
print('CANNOT OVERWRITE dbpath')
sys.exit(2)
@@ -213,7 +230,7 @@ def parse_ror(filename):
print('reading {}'.format(ror_file.name))
ror_funders = FunderList.model_validate_json(ror_file.read_text(encoding='UTF-8'))
else:
if not ror_file.is_file():
if not Path(_RAW_ROR_JSON).is_file():
fetch_ror()
print('parsing {}...this is slow to parse 112000 entries...'.format(_ROR_JSON))
ror_funders = parse_ror(_RAW_ROR_JSON)
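
The net effect of the fetch_ror() changes above is that the indexer now asks the Zenodo records API for the newest ror-data deposit and downloads the schema v2 JSON dump rather than the v1 file. A minimal consolidated sketch of that flow follows; the Zenodo query, the _schema_v2.json rewriting, and the filename test come from the diff, while the download/extract details (collapsed in this view) and the local filenames are assumptions.

import os
import zipfile
import requests

_RAW_ROR_JSON = 'ror_raw.json'   # assumed stand-in for the module constant

def fetch_latest_ror_v2():
    # The ror-data community on Zenodo lists deposits newest-first.
    records = requests.get(
        'https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent').json()
    latest = records.get('hits').get('hits')[0]
    print('ROR data from {}'.format(latest.get('metadata').get('publication_date')))
    # Point the file link at the schema v2 dump instead of the v1 JSON.
    latest_url = latest.get('files')[0].get('links').get('self')
    latest_url = latest_url.replace('.json', '_schema_v2.json')
    # Download and unpack; these steps are collapsed in the diff, so the
    # details below are an assumed reconstruction.
    with requests.get(latest_url, stream=True) as resp, open('ror-data.zip', 'wb') as out:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            out.write(chunk)
    with zipfile.ZipFile('ror-data.zip') as zf:
        names = zf.namelist()
        zf.extractall()
    for fname in names:
        if fname.endswith('_schema_v2.json'):
            os.rename(fname, _RAW_ROR_JSON)
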
9 changes: 8 additions & 1 deletion search/searchapp/index/data/countries.json
@@ -1491,4 +1491,11 @@
"iso2": "ZW",
"iso3" : "ZWE",
"numeric": "716"
}]
},
{
"name": "Odarn",
"iso2": "OO",
"iso3": "OOO",
"numeric": "999"
}
]
10 changes: 6 additions & 4 deletions search/searchapp/index/model.py
@@ -28,7 +28,7 @@
"""

from enum import Enum
from pydantic import StringConstraints, ConfigDict, BaseModel, Field, conint, conlist, validator, AnyUrl
from pydantic import StringConstraints, ConfigDict, BaseModel, Field, conint, conlist, validator, AnyUrl, constr
from typing import List, Dict, Optional, Union, Literal
from typing_extensions import Annotated

@@ -98,9 +98,11 @@ class Funder(GlobalEntity):
country: str = Field(...,
title='Country of affiliation',
description='May be any string')
country_code: Optional[str] = Field(default=None,
title='ISO 3-letter country code.',
description='Optional')
country_code: Optional[constr(to_lower=True,
min_length=2,
max_length=2)] = Field(default=None,
title='ISO 2-letter country code.',
description='Optional')
funder_type: FunderType = Field(...,
title='The type of funding agency',
description='May be from ROR.')
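
The model change narrows country_code from an arbitrary string to a two-letter, lower-cased code. A small self-contained sketch of the resulting validation behavior, using a hypothetical stand-in model rather than the real Funder class:

from typing import Optional
from pydantic import BaseModel, Field, constr

class FunderStub(BaseModel):
    # Hypothetical stand-in; it mirrors only the new country_code constraint.
    country_code: Optional[constr(to_lower=True,
                                  min_length=2,
                                  max_length=2)] = Field(default=None)

print(FunderStub(country_code='DE').country_code)   # prints 'de' (normalized to lower case)
print(FunderStub().country_code)                    # prints None (the field is still optional)
# FunderStub(country_code='DEU') would raise a ValidationError: three letters.
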
3 changes: 2 additions & 1 deletion search/searchapp/index/rdf_parser.py
@@ -8,6 +8,7 @@
from xml.sax.handler import ContentHandler
from model import Funder, RelationshipType, DataSource, StrEnum, FunderType, FunderList
from model import add_names_to_relationships
from countries import iso_codes
# This is a map from the values of svf:fundingBodySubType to the
# associated FunderType. We don't have a schema to define the values
# of svf:fundingBodySubType. They were apparently provided by
@@ -109,7 +110,7 @@ def endElement(self, name):
except:
raise ValueError('unrecognized funder type: ' + self.content)
elif self.current_tag == Tag.addressCountry:
self.item['country_code'] = self.content
self.item['country_code'] = iso_codes[self.content.lower()]['code']
self.item['country'] = self.country_map.get(self.content, 'unknown')
self.current_tag = None
self.content = '' # reset at end of tag.
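
The parser now converts the svf:addressCountry value into a two-letter code via the new countries.iso_codes table instead of storing the raw string. That table lives in the 1,009-line countries.py added by this commit, which is not rendered above, so the sketch below only illustrates the shape implied by the lookup iso_codes[content.lower()]['code']; the example keys, and whether the real keys are country names or some other identifier, are assumptions.

# Assumed shape of countries.iso_codes, inferred solely from how the parser uses it.
iso_codes = {
    'germany': {'code': 'de'},
    'united states': {'code': 'us'},
    'odarn': {'code': 'oo'},   # the easter-egg country added to countries.json in this commit
}

def country_code_for(address_country):
    # Mirrors the parser: lower-case the RDF value, then map it to an ISO-2 code.
    return iso_codes[address_country.lower()]['code']

assert country_code_for('Germany') == 'de'
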
38 changes: 20 additions & 18 deletions search/searchapp/index/search_lib.py
@@ -25,6 +25,7 @@ class SearchPrefix(str, Enum):
ORGTYPE = 'O'
ID = 'Q'
SOURCE = 'XS'
COUNTRY = 'N'

def index_funder(funder, writable_db=None, termgenerator=None):
"""Index the funder. It returns no value. It is used by create_index.py.
@@ -44,7 +45,10 @@ def index_funder(funder, writable_db=None, termgenerator=None):
doc = xapian.Document()
docid = funder.global_id()
doc.add_boolean_term(docid)
doc.add_boolean_term(SearchPrefix.SOURCE.value + funder.source.value)
termgenerator.increase_termpos()
doc.add_boolean_term(SearchPrefix.SOURCE.value + funder.source.value.lower())
termgenerator.increase_termpos()
doc.add_boolean_term(SearchPrefix.COUNTRY.value + funder.country_code.lower())
# We sort on SLOT_NUMBER
slot_value = '1' if funder.source.value == 'fundreg' else '0'
doc.add_value(SLOT_NUMBER, slot_value)
@@ -54,18 +58,18 @@
termgenerator.index_text(name, 1, SearchPrefix.NAME.value)
termgenerator.index_text(name, NAME_WEIGHT)

termgenerator.increase_termpos()
for altname in funder.altnames:
termgenerator.increase_termpos()
termgenerator.index_text(altname, 1, SearchPrefix.NAME.value)
termgenerator.index_text(altname, NAME_WEIGHT)
for child in funder.children:
termgenerator.increase_termpos()
termgenerator.index_text(child.name, 1, SearchPrefix.NAME.value)
termgenerator.index_text(child.name, NAME_WEIGHT)

termgenerator.increase_termpos()
location = funder.country
termgenerator.index_text(location, 1, SearchPrefix.LOCATION.value)
termgenerator.increase_termpos()

termgenerator.increase_termpos()
orgtype = funder.funder_type.value
@@ -76,26 +80,26 @@

data = funder.dict()
data['id'] = docid
data['source_id']
doc.set_data(json.dumps(data, indent=2))
writable_db.replace_document(docid, doc)

def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=None, app=None):
def search(db_path, offset=0, limit=1000, textq=None, country=None, source=None, app=None):
"""Execute a query on the index. At least one of textq or locationq
must be non-None.
Args:
db_path: path to database
offset: starting offset for paging of results
textq: raw query string from the user to be applied to any text field
locationq: raw query for location field
country: country code filter
Returns: dict with the following:
error: string if an error occurs (no other fields in this case)
parsed_query: debug parsed query
estimated_results: number of total results available
results: an array of results
"""
if (not textq and not locationq):
if textq is None and country is None:
app.logger.info('search with not query')
return {'estimated_results': 0,
'parsed_query': '',
'spell_corrected_query': '',
@@ -122,21 +126,19 @@ def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=Non
# FLAG_WILDCARD enables things like * signature scheme to expand the *
flags = queryparser.FLAG_SPELLING_CORRECTION | queryparser.FLAG_BOOLEAN | queryparser.FLAG_LOVEHATE | queryparser.FLAG_PHRASE | queryparser.FLAG_WILDCARD
# we build a list of subqueries and combine them later with AND.
if not textq and not locationq:
return {'error': 'missing query'}
query_list = []
if textq:
terms = textq.split()
terms[-1] = terms[-1] + '*'
for term in terms:
query_list.append(queryparser.parse_query(term, flags))
if locationq:
location_query = queryparser.parse_query(locationq, flags, SearchPrefix.LOCATION.value)
query_list.append(location_query)
query = xapian.Query(xapian.Query.OP_AND, query_list)
if source: # filter on this source value.
if source and source != 'all': # filter on this source value.
source_query = xapian.Query(SearchPrefix.SOURCE.value + source)
query = xapian.Query(xapian.Query.OP_FILTER, query, source_query)
if country:
country_query = xapian.Query(SearchPrefix.COUNTRY.value + country.lower())
query = xapian.Query(xapian.Query.OP_FILTER, query, country_query)
# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)
enquire.set_query(query)
@@ -181,13 +183,13 @@ def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=Non
help='Path to writable database directory.')
arguments.add_argument('--name',
help='basic query')
arguments.add_argument('--location',
help='query restricted to location')
arguments.add_argument('--country',
help='query restricted to country code')
arguments.add_argument('--source',
help='ror or fundreg or None')
args = arguments.parse_args()
if not args.name and not args.location:
print('one of --name or --location is required')
if not args.name and not args.country:
print('one of --name or --country is required')
sys.exit(2)
results = search(args.dbpath, 0, 100, args.name, args.location, args.source)
results = search(args.dbpath, 0, 100, args.name, args.country, args.source)
print(json.dumps(results, indent=2))
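
With the free-text location query replaced by a country-code filter, a caller now passes a two-letter code that is matched exactly against the boolean 'N'-prefixed terms written at index time. A hypothetical driver call, assuming an index has already been built at ./xapian.db:

from search_lib import search

results = search('./xapian.db',           # assumed index location
                 offset=0, limit=10,
                 textq='research council',
                 country='gb',             # matched as the boolean term 'Ngb'
                 source='ror')             # omit, or pass 'all', to skip the source filter
print(results['estimated_results'])

The command-line equivalent would be: python search_lib.py --dbpath ./xapian.db --name 'research council' --country gb --source ror
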
4 changes: 2 additions & 2 deletions search/searchapp/routes.py
@@ -19,14 +19,14 @@ def _get_dbpath(args):
@search_bp.route('/search', methods=['GET'])
def get_results():
args = request.args.to_dict()
if 'textq' not in args and 'locationq' not in args:
if 'textq' not in args and 'country' not in args:
response = jsonify({'error': 'missing queries'})
else:
db_path = _get_dbpath(args)
response = jsonify(search(db_path,
offset=args.get('offset', 0),
textq=args.get('textq'),
locationq=args.get('locationq'),
country=args.get('country'),
source=args.get('source'),
app=app))
response.headers.add('Access-Control-Allow-Origin', '*');
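
On the HTTP side, the /search route now accepts a country parameter in place of locationq. A hypothetical client request against a local development server (host, port, and parameter values are placeholders):

import requests

resp = requests.get('http://localhost:5000/search',   # assumed dev server address
                    params={'textq': 'science foundation',
                            'country': 'us',
                            'source': 'fundreg'})
print(resp.json().get('estimated_results'))
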
