
Change how search is done on ror and fundreg.
kmccurley committed Dec 22, 2024
1 parent f7bdcdb commit b508a6c
Showing 10 changed files with 2,139 additions and 85 deletions.
1,009 changes: 1,009 additions & 0 deletions search/searchapp/index/countries.py

Large diffs are not rendered by default.

27 changes: 22 additions & 5 deletions search/searchapp/index/create_index.py
@@ -43,7 +43,6 @@ def create_index(dbpath, funderlist, verbose=False):
termgenerator.set_flags(termgenerator.FLAG_SPELLING);
count = 0
for funder in funderlist.funders.values():
narrower = {}
index_funder(funder, db, termgenerator)
count += 1
if count % 5000 == 0:
@@ -63,12 +62,14 @@ def fetch_fundreg():
def fetch_ror():
print('fetching ROR data')
# Apparently we have to use the zenodo schema to determine the date on the latest ROR data.
response = requests.get('https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent')
version_data = response.json().get('hits').get('hits')[0]
response = requests.get('https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent').json()
print(json.dumps(response, indent=2))
version_data = response.get('hits').get('hits')[0]
print(json.dumps(version_data, indent=2))
publication_date = version_data.get('metadata').get('publication_date')
print('ROR data from {}'.format(publication_date))
latest_url = version_data.get('files')[0].get('links').get('self')
latest_url = latest_url.replace('.json', '_schema_v2.json')
print('fetching {}'.format(latest_url))
# latest_url should be a zip file.
with requests.get(latest_url, stream=True) as stream:
@@ -80,7 +81,7 @@ def fetch_ror():
namelist = zipObj.namelist()
zipObj.extractall()
for fname in namelist:
if fname.endswith('.json'):
if fname.endswith('_schema_v2.json'):
os.rename(fname, _RAW_ROR_JSON)
else:
try:
@@ -181,10 +182,26 @@ def parse_ror(filename):
arguments.add_argument('--exclude_dup_fundref',
action='store_true',
help='Whether to replace Fundref with corresponding ROR')
arguments.add_argument('--easter_egg',
action='store_true',
help='Whether to add an easter egg.')
args = arguments.parse_args()
ror_file = Path(_ROR_JSON)
country_map = json.loads(open('data/country_map.json', 'r').read())
allfunders = FunderList(funders={})
if args.easter_egg:
obj = {'source': 'ror',
'source_id': 'ror_0ohdarn00',
'name': 'University of Second Choice',
'country': 'Odarn',
'funder_type': 'Education',
'country_code': 'OO',
'altnames': [],
'children': [],
'parents': [],
'related': []}
allfunders.funders['ror_0unreal13'] = Funder(**obj)
print(allfunders.funders)
if os.path.isfile(args.dbpath) or os.path.isdir(args.dbpath):
print('CANNOT OVERWRITE dbpath')
sys.exit(2)
@@ -213,7 +230,7 @@ def parse_ror(filename):
print('reading {}'.format(ror_file.name))
ror_funders = FunderList.model_validate_json(ror_file.read_text(encoding='UTF-8'))
else:
if not ror_file.is_file():
if not Path(_RAW_ROR_JSON).is_file():
fetch_ror()
print('parsing {}...this is slow to parse 112000 entries...'.format(_ROR_JSON))
ror_funders = parse_ror(_RAW_ROR_JSON)
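
The net effect of the fetch_ror() changes above is that the indexer now asks the Zenodo records API for the newest ror-data deposit and downloads the schema v2 JSON dump rather than the v1 file. A minimal consolidated sketch of that flow follows; the Zenodo query, the _schema_v2.json rewriting, and the filename test come from the diff, while the download/extract details (collapsed in this view) and the local filenames are assumptions.

import os
import zipfile
import requests

_RAW_ROR_JSON = 'ror_raw.json'   # assumed stand-in for the module constant

def fetch_latest_ror_v2():
    # The ror-data community on Zenodo lists deposits newest-first.
    records = requests.get(
        'https://zenodo.org/api/records/?communities=ror-data&sort=mostrecent').json()
    latest = records.get('hits').get('hits')[0]
    print('ROR data from {}'.format(latest.get('metadata').get('publication_date')))
    # Point the file link at the schema v2 dump instead of the v1 JSON.
    latest_url = latest.get('files')[0].get('links').get('self')
    latest_url = latest_url.replace('.json', '_schema_v2.json')
    # Download and unpack; these steps are collapsed in the diff, so the
    # details below are an assumed reconstruction.
    with requests.get(latest_url, stream=True) as resp, open('ror-data.zip', 'wb') as out:
        for chunk in resp.iter_content(chunk_size=1 << 20):
            out.write(chunk)
    with zipfile.ZipFile('ror-data.zip') as zf:
        names = zf.namelist()
        zf.extractall()
    for fname in names:
        if fname.endswith('_schema_v2.json'):
            os.rename(fname, _RAW_ROR_JSON)
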
9 changes: 8 additions & 1 deletion search/searchapp/index/data/countries.json
@@ -1491,4 +1491,11 @@
"iso2": "ZW",
"iso3" : "ZWE",
"numeric": "716"
}]
},
{
"name": "Odarn",
"iso2": "OO",
"iso3": "OOO",
"numeric": "999"
}
]
10 changes: 6 additions & 4 deletions search/searchapp/index/model.py
@@ -28,7 +28,7 @@
"""

from enum import Enum
from pydantic import StringConstraints, ConfigDict, BaseModel, Field, conint, conlist, validator, AnyUrl
from pydantic import StringConstraints, ConfigDict, BaseModel, Field, conint, conlist, validator, AnyUrl, constr
from typing import List, Dict, Optional, Union, Literal
from typing_extensions import Annotated

@@ -98,9 +98,11 @@ class Funder(GlobalEntity):
country: str = Field(...,
title='Country of affiliation',
description='May be any string')
country_code: Optional[str] = Field(default=None,
title='ISO 3-letter country code.',
description='Optional')
country_code: Optional[constr(to_lower=True,
min_length=2,
max_length=2)] = Field(default=None,
title='ISO 2-letter country code.',
description='Optional')
funder_type: FunderType = Field(...,
title='The type of funding agency',
description='May be from ROR.')
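
The model change narrows country_code from an arbitrary string to a two-letter, lower-cased code. A small self-contained sketch of the resulting validation behavior, using a hypothetical stand-in model rather than the real Funder class:

from typing import Optional
from pydantic import BaseModel, Field, constr

class FunderStub(BaseModel):
    # Hypothetical stand-in; it mirrors only the new country_code constraint.
    country_code: Optional[constr(to_lower=True,
                                  min_length=2,
                                  max_length=2)] = Field(default=None)

print(FunderStub(country_code='DE').country_code)   # prints 'de' (normalized to lower case)
print(FunderStub().country_code)                    # prints None (the field is still optional)
# FunderStub(country_code='DEU') would raise a ValidationError: three letters.
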
3 changes: 2 additions & 1 deletion search/searchapp/index/rdf_parser.py
@@ -8,6 +8,7 @@
from xml.sax.handler import ContentHandler
from model import Funder, RelationshipType, DataSource, StrEnum, FunderType, FunderList
from model import add_names_to_relationships
from countries import iso_codes
# This is a map from the values of svf:fundingBodySubType to the
# associated FunderType. We don't have a schema to define the values
# of svf:fundingBodySubType. They were apparently provided by
@@ -109,7 +110,7 @@ def endElement(self, name):
except:
raise ValueError('unrecognized funder type: ' + self.content)
elif self.current_tag == Tag.addressCountry:
self.item['country_code'] = self.content
self.item['country_code'] = iso_codes[self.content.lower()]['code']
self.item['country'] = self.country_map.get(self.content, 'unknown')
self.current_tag = None
self.content = '' # reset at end of tag.
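
The parser now converts the svf:addressCountry value into a two-letter code via the new countries.iso_codes table instead of storing the raw string. That table lives in the 1,009-line countries.py added by this commit, which is not rendered above, so the sketch below only illustrates the shape implied by the lookup iso_codes[content.lower()]['code']; the example keys, and whether the real keys are country names or some other identifier, are assumptions.

# Assumed shape of countries.iso_codes, inferred solely from how the parser uses it.
iso_codes = {
    'germany': {'code': 'de'},
    'united states': {'code': 'us'},
    'odarn': {'code': 'oo'},   # the easter-egg country added to countries.json in this commit
}

def country_code_for(address_country):
    # Mirrors the parser: lower-case the RDF value, then map it to an ISO-2 code.
    return iso_codes[address_country.lower()]['code']

assert country_code_for('Germany') == 'de'
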
38 changes: 20 additions & 18 deletions search/searchapp/index/search_lib.py
@@ -25,6 +25,7 @@ class SearchPrefix(str, Enum):
ORGTYPE = 'O'
ID = 'Q'
SOURCE = 'XS'
COUNTRY = 'N'

def index_funder(funder, writable_db=None, termgenerator=None):
"""Index the funder. It returns no value. It is used by create_index.py.
@@ -44,7 +45,10 @@ def index_funder(funder, writable_db=None, termgenerator=None):
doc = xapian.Document()
docid = funder.global_id()
doc.add_boolean_term(docid)
doc.add_boolean_term(SearchPrefix.SOURCE.value + funder.source.value)
termgenerator.increase_termpos()
doc.add_boolean_term(SearchPrefix.SOURCE.value + funder.source.value.lower())
termgenerator.increase_termpos()
doc.add_boolean_term(SearchPrefix.COUNTRY.value + funder.country_code.lower())
# We sort on SLOT_NUMBER
slot_value = '1' if funder.source.value == 'fundreg' else '0'
doc.add_value(SLOT_NUMBER, slot_value)
@@ -54,18 +58,18 @@
termgenerator.index_text(name, 1, SearchPrefix.NAME.value)
termgenerator.index_text(name, NAME_WEIGHT)

termgenerator.increase_termpos()
for altname in funder.altnames:
termgenerator.increase_termpos()
termgenerator.index_text(altname, 1, SearchPrefix.NAME.value)
termgenerator.index_text(altname, NAME_WEIGHT)
for child in funder.children:
termgenerator.increase_termpos()
termgenerator.index_text(child.name, 1, SearchPrefix.NAME.value)
termgenerator.index_text(child.name, NAME_WEIGHT)

termgenerator.increase_termpos()
location = funder.country
termgenerator.index_text(location, 1, SearchPrefix.LOCATION.value)
termgenerator.increase_termpos()

termgenerator.increase_termpos()
orgtype = funder.funder_type.value
@@ -76,26 +80,26 @@

data = funder.dict()
data['id'] = docid
data['source_id']
doc.set_data(json.dumps(data, indent=2))
writable_db.replace_document(docid, doc)

def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=None, app=None):
def search(db_path, offset=0, limit=1000, textq=None, country=None, source=None, app=None):
"""Execute a query on the index. At least one of textq or locationq
must be non-None.
Args:
db_path: path to database
offset: starting offset for paging of results
textq: raw query string from the user to be applied to any text field
locationq: raw query for location field
country: country code filter
Returns: dict with the following:
error: string if an error occurs (no other fields in this case)
parsed_query: debug parsed query
estimated_results: number of total results available
results: an array of results
"""
if (not textq and not locationq):
if textq is None and country is None:
app.logger.info('search with not query')
return {'estimated_results': 0,
'parsed_query': '',
'spell_corrected_query': '',
@@ -122,21 +126,19 @@ def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=Non
# FLAG_WILDCARD enables things like * signature scheme to expand the *
flags = queryparser.FLAG_SPELLING_CORRECTION | queryparser.FLAG_BOOLEAN | queryparser.FLAG_LOVEHATE | queryparser.FLAG_PHRASE | queryparser.FLAG_WILDCARD
# we build a list of subqueries and combine them later with AND.
if not textq and not locationq:
return {'error': 'missing query'}
query_list = []
if textq:
terms = textq.split()
terms[-1] = terms[-1] + '*'
for term in terms:
query_list.append(queryparser.parse_query(term, flags))
if locationq:
location_query = queryparser.parse_query(locationq, flags, SearchPrefix.LOCATION.value)
query_list.append(location_query)
query = xapian.Query(xapian.Query.OP_AND, query_list)
if source: # filter on this source value.
if source and source != 'all': # filter on this source value.
source_query = xapian.Query(SearchPrefix.SOURCE.value + source)
query = xapian.Query(xapian.Query.OP_FILTER, query, source_query)
if country:
country_query = xapian.Query(SearchPrefix.COUNTRY.value + country.lower())
query = xapian.Query(xapian.Query.OP_FILTER, query, country_query)
# Use an Enquire object on the database to run the query
enquire = xapian.Enquire(db)
enquire.set_query(query)
@@ -181,13 +183,13 @@ def search(db_path, offset=0, limit=1000, textq=None, locationq=None, source=Non
help='Path to writable database directory.')
arguments.add_argument('--name',
help='basic query')
arguments.add_argument('--location',
help='query restricted to location')
arguments.add_argument('--country',
help='query restricted to country code')
arguments.add_argument('--source',
help='ror or fundreg or None')
args = arguments.parse_args()
if not args.name and not args.location:
print('one of --name or --location is required')
if not args.name and not args.country:
print('one of --name or --country is required')
sys.exit(2)
results = search(args.dbpath, 0, 100, args.name, args.location, args.source)
results = search(args.dbpath, 0, 100, args.name, args.country, args.source)
print(json.dumps(results, indent=2))
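
With the free-text location query replaced by a country-code filter, a caller now passes a two-letter code that is matched exactly against the boolean 'N'-prefixed terms written at index time. A hypothetical driver call, assuming an index has already been built at ./xapian.db:

from search_lib import search

results = search('./xapian.db',           # assumed index location
                 offset=0, limit=10,
                 textq='research council',
                 country='gb',             # matched as the boolean term 'Ngb'
                 source='ror')             # omit, or pass 'all', to skip the source filter
print(results['estimated_results'])

The command-line equivalent would be: python search_lib.py --dbpath ./xapian.db --name 'research council' --country gb --source ror
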
4 changes: 2 additions & 2 deletions search/searchapp/routes.py
@@ -19,14 +19,14 @@ def _get_dbpath(args):
@search_bp.route('/search', methods=['GET'])
def get_results():
args = request.args.to_dict()
if 'textq' not in args and 'locationq' not in args:
if 'textq' not in args and 'country' not in args:
response = jsonify({'error': 'missing queries'})
else:
db_path = _get_dbpath(args)
response = jsonify(search(db_path,
offset=args.get('offset', 0),
textq=args.get('textq'),
locationq=args.get('locationq'),
country=args.get('country'),
source=args.get('source'),
app=app))
response.headers.add('Access-Control-Allow-Origin', '*');
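
On the HTTP side, the /search route now accepts a country parameter in place of locationq. A hypothetical client request against a local development server (host, port, and parameter values are placeholders):

import requests

resp = requests.get('http://localhost:5000/search',   # assumed dev server address
                    params={'textq': 'science foundation',
                            'country': 'us',
                            'source': 'fundreg'})
print(resp.json().get('estimated_results'))
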
