Updates to process #84

Open · wants to merge 34 commits into base: master

Commits (34)
0135edb
remove unnecessary options from config parsing
gtfierro Mar 27, 2014
d13015c
make join on previous disambig output optional
gtfierro Mar 27, 2014
3bb0881
insert most recent location for disambiguated inventor/assignee into …
gtfierro Mar 27, 2014
69beee9
fix typo
gtfierro Mar 27, 2014
c695ded
add Vagrant config
gtfierro Mar 27, 2014
97e2549
fix indent
gtfierro Mar 27, 2014
6b9487e
remove old import
gtfierro Mar 28, 2014
f610510
adjust configuration for VM
gtfierro Mar 28, 2014
b8dae6e
avoid overlapping uuids
gtfierro Mar 28, 2014
7cf15b7
fix order
gtfierro Mar 29, 2014
2ef665c
run integrated clean/consolidate
gtfierro Mar 29, 2014
d845c52
update vm
gtfierro Mar 29, 2014
c29b8fd
try to fix problems with consolidate
gtfierro Mar 31, 2014
fd80cd1
update consolidate to treat database as groundtruth
gtfierro Apr 2, 2014
e88a529
include metadata
gtfierro Apr 1, 2014
9767ba4
incrase size of state field
gtfierro Apr 2, 2014
109618f
update MySQL notebook
gtfierro Apr 4, 2014
b3d6f4d
Merge remote-tracking branch 'origin/master'
gtfierro Apr 4, 2014
5ed458e
remove old tables from schema
gtfierro Apr 16, 2014
87441e7
do not commit empty ipcr for app
gtfierro Apr 16, 2014
aa9ec54
continue removing unused columns and tables
gtfierro Apr 16, 2014
d876b10
remove inventor nationality and foreigncitation name
gtfierro Apr 16, 2014
f819ded
run geolocation disambig on both databases
gtfierro Apr 16, 2014
8b8096c
fixup integrate.py to work on grant + app
gtfierro Apr 25, 2014
e3d508b
update integrate script
gtfierro Apr 29, 2014
a122b2d
add nber subs file for assignee disambig
gtfierro Apr 30, 2014
d34fa29
add method to mark granted
gtfierro Apr 30, 2014
b938551
fix typos, ensure ordered delivery of records
gtfierro May 1, 2014
621c32c
remove nationality from schema
gtfierro May 5, 2014
5d0c713
do not attempt usreldoc for applications
gtfierro May 5, 2014
fcd054a
no error on bad lines
gtfierro May 14, 2014
720dce8
add nber substitutions to assignee disambig
gtfierro May 14, 2014
dcf2988
add sub file
gtfierro May 14, 2014
1c09b5f
finish linking up records in integrate
gtfierro May 29, 2014
33 changes: 18 additions & 15 deletions consolidate.py
@@ -123,9 +123,12 @@ def main(year, doctype):
namedict['name_last'] = name_last
rawloc = ri.rawlocation
if rawloc:
loc = rawloc.location
if rawloc.location:
loc = rawloc.location
else:
loc = primloc
else:
loc = None
loc = primloc
namedict['state'] = loc.state if loc else ''# if loc else rawloc.state if rawloc else primloc.state if primloc else ''
namedict['country'] = loc.country if loc else ''# if loc else rawloc.country if rawloc else primloc.country if primloc else ''
namedict['city'] = loc.city if loc else ''# if loc else rawloc.city if rawloc else primloc.city if primloc else ''
@@ -144,29 +147,29 @@ def main(year, doctype):
print e
continue

def join(oldfile, newfile):
def join(newfile):
"""
Does a JOIN on the rawinventor uuid field to associate rawinventors in this
round with inventor_ids they were assigned in the previous round of
disambiguation. This improves the runtime of the inventor disambiguator
"""
new = pd.read_csv(newfile,delimiter='\t',header=None)
old = pd.read_csv(oldfile,delimiter='\t',header=None)
new = pd.read_csv(newfile,delimiter='\t',header=None, error_bad_lines=False)
new[0] = new[0].astype(str)
ses_gen = alchemy.session_generator(dbtype='grant')
s = ses_gen()
old = s.execute('select uuid, inventor_id from rawinventor where inventor_id != "";')
old = pd.DataFrame.from_records(old.fetchall())
old[0] = old[0].astype(str)
merged = pd.merge(new,old,on=0,how='left')
merged.to_csv('disambiguator_{0}.tsv'.format(datetime.now().strftime('%B_%d')), index=False, header=None, sep='\t')

if __name__ == '__main__':
if len(sys.argv) < 2:
print "Provide path to previous disambiguation output"
pritn "USAGE: python consolidate.py <path/to/old/disambiguation/output.tsv>"
sys.exit(1)
prev_output = sys.argv[1]
for year in range(1975, datetime.today().year+1):
print 'Running year',year,datetime.now(),'for grant'
main(year, 'grant')
print 'Running year',year,datetime.now(),'for grant'
main(year, 'grant')
for year in range(2001, datetime.today().year+1):
print 'Running year',year,datetime.now(),'for application'
main(year, 'application')
print 'Running year',year,datetime.now(),'for application'
main(year, 'application')

# join files
join(prev_output, 'disambiguator.csv')
join('disambiguator.csv')
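
A minimal sketch of the reworked join() flow, for context. It assumes only pandas and the project's alchemy.session_generator helper already shown in the diff above; the function name join_sketch is illustrative. Instead of merging against a previous-output TSV passed on the command line, the grant database is now treated as the ground truth for uuid -> inventor_id pairs:

    import pandas as pd
    from datetime import datetime
    import lib.alchemy as alchemy

    def join_sketch(newfile):
        # Load this round's disambiguator output; skip malformed rows instead of aborting
        new = pd.read_csv(newfile, delimiter='\t', header=None, error_bad_lines=False)
        new[0] = new[0].astype(str)
        # uuid -> inventor_id pairs already committed to the grant database
        session = alchemy.session_generator(dbtype='grant')()
        old = session.execute('select uuid, inventor_id from rawinventor where inventor_id != "";')
        old = pd.DataFrame.from_records(old.fetchall())
        old[0] = old[0].astype(str)
        # Left merge on the rawinventor uuid (column 0) keeps rows with no prior id
        merged = pd.merge(new, old, on=0, how='left')
        merged.to_csv('disambiguator_{0}.tsv'.format(datetime.now().strftime('%B_%d')),
                      index=False, header=None, sep='\t')
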
233 changes: 187 additions & 46 deletions integrate.py
@@ -38,7 +38,7 @@
import lib.alchemy as alchemy
from lib.util.csv_reader import read_file
from lib.alchemy import is_mysql
from lib.alchemy.schema import Inventor, RawInventor, patentinventor
from lib.alchemy.schema import Inventor, RawInventor, patentinventor, App_Inventor, App_RawInventor, applicationinventor, appmetadata
from lib.handlers.xml_util import normalize_document_identifier
from collections import defaultdict
import cPickle as pickle
@@ -66,32 +66,37 @@ def integrate(disambig_input_file, disambig_output_file):
just have to populate the fields of the disambiguated inventor object:
inventor id, first name, last name, nationality (?)
"""
#disambig_input = pd.read_csv(disambig_input_file,header=None,delimiter='\t',encoding='utf-8',skiprows=[1991872])
#disambig_output = pd.read_csv(disambig_output_file,header=None,delimiter='\t',encoding='utf-8',skiprows=[1991872])
disambig_input = pd.read_csv(disambig_input_file,header=None,delimiter='\t',encoding='utf-8',skiprows=[1991872])
disambig_output = pd.read_csv(disambig_output_file,header=None,delimiter='\t',encoding='utf-8',skiprows=[1991872])
#disambig_input[0] = disambig_input[0].apply(str)
#disambig_output[0] = disambig_output[0].apply(str)
disambig_input = pd.read_csv(disambig_input_file,header=None,delimiter='\t',encoding='utf-8')
disambig_output = pd.read_csv(disambig_output_file,header=None,delimiter='\t',encoding='utf-8')
disambig_input[0] = disambig_input[0].apply(str)
disambig_output[0] = disambig_output[0].apply(str)
print 'finished loading csvs'
merged = pd.merge(disambig_input, disambig_output, on=0)
import IPython
d = locals()
d.update(globals())
IPython.embed(user_ns=d)
merged.columns = ['rawinventor_uuid','isgrant','granted','name_first','name_middle','name_last','patent_id','mainclass','subclass','city','state','country','assignee','rawassignee','prev_inventorid','current_inventorid']
print 'finished merging'
#inventor_attributes = merged[[0,'1_y','1_x',2,3,4]] # rawinventor uuid, inventor id, first name, middle name, last name, patent_id
inventor_attributes = inventor_attributes.dropna(subset=[0],how='all')
inventor_attributes[2] = inventor_attributes[2].fillna('')
inventor_attributes[3] = inventor_attributes[3].fillna('')
inventor_attributes['1_x'] = inventor_attributes['1_x'].fillna('')
apps = merged[merged['isgrant'] == 0]


inventor_attributes = merged[['isgrant','rawinventor_uuid','current_inventorid','name_first','name_middle','name_last','patent_id']] # rawinventor uuid, inventor id, first name, middle name, last name, patent_id
inventor_attributes = inventor_attributes.dropna(subset=['rawinventor_uuid'],how='all')
inventor_attributes['name_first'] = inventor_attributes['name_first'].fillna('')
inventor_attributes['name_middle'] = inventor_attributes['name_middle'].fillna('')
inventor_attributes['name_last'] = inventor_attributes['name_last'].fillna('')

grants = inventor_attributes[inventor_attributes['isgrant'] == 1]
apps = inventor_attributes[inventor_attributes['isgrant'] == 0]
del grants['isgrant']
del apps['isgrant']

####### DO GRANTS #######
rawinventors = defaultdict(list)
inventor_inserts = []
rawinventor_updates = []
patentinventor_inserts = []
for row in inventor_attributes.iterrows():
uuid = row[1]['1_y']
for row in grants.iterrows():
uuid = row[1]['current_inventorid']
rawinventors[uuid].append(row[1])
patentinventor_inserts.append({'inventor_id': uuid, 'patent_id': row[1][4]})
patentinventor_inserts.append({'inventor_id': uuid, 'patent_id': row[1]['patent_id']})
print 'finished associating ids'
i = 0
for inventor_id in rawinventors.iterkeys():
@@ -102,7 +107,7 @@ def integrate(disambig_input_file, disambig_output_file):
names = []
for raw in rawinventors[inventor_id]:
rawuuids.append(raw[0])
name = ' '.join(x for x in (raw['1_x'], raw[2], raw[3]) if x)
name = ' '.join(x for x in (raw['name_first'], raw['name_middle'], raw['name_last']) if x)
freq['name'][name] += 1
for k,v in raw.iteritems():
freq[k][v] += 1
@@ -120,51 +125,187 @@ def integrate(disambig_input_file, disambig_output_file):
if i % 100000 == 0:
print i, datetime.now(), rawuuids[0]
print 'finished voting'
session_generator = alchemy.session_generator()
session_generator = alchemy.session_generator(dbtype='grant')
session = session_generator()
if alchemy.is_mysql():
session.execute('truncate inventor; truncate patent_inventor;')
else:
session.execute('delete from inventor; delete from patent_inventor;')

from lib.tasks import bulk_commit_inserts, bulk_commit_updates
bulk_commit_inserts(inventor_inserts, Inventor.__table__, is_mysql(), 20000)
bulk_commit_inserts(patentinventor_inserts, patentinventor, is_mysql(), 20000)
bulk_commit_updates('inventor_id', rawinventor_updates, RawInventor.__table__, is_mysql(), 20000)
bulk_commit_inserts(inventor_inserts, Inventor.__table__, is_mysql(), 20000,'grant')
bulk_commit_inserts(patentinventor_inserts, patentinventor, is_mysql(), 20000,'grant')
bulk_commit_updates('inventor_id', rawinventor_updates, RawInventor.__table__, is_mysql(), 20000,'grant')


###### DO APPLICATIONS ######

rawinventors = defaultdict(list)
inventor_inserts = []
rawinventor_updates = []
patentinventor_inserts = []
for row in apps.iterrows():
uuid = row[1]['current_inventorid']
rawinventors[uuid].append(row[1])
patentinventor_inserts.append({'inventor_id': uuid, 'patent_id': row[1]['patent_id']})
print 'finished associating ids'
i = 0
for inventor_id in rawinventors.iterkeys():
i += 1
freq = defaultdict(Counter)
param = {}
rawuuids = []
names = []
for raw in rawinventors[inventor_id]:
rawuuids.append(raw[0])
name = ' '.join(x for x in (raw['name_first'], raw['name_middle'], raw['name_last']) if x)
freq['name'][name] += 1
for k,v in raw.iteritems():
freq[k][v] += 1
param['id'] = inventor_id
name = freq['name'].most_common(1)[0][0]
name_first = unidecode(' '.join(name.split(' ')[:-1]))
name_last = unidecode(name.split(' ')[-1])
param['name_first'] = name_first
param['name_last'] = name_last
param['nationality'] = ''
assert set(param.keys()) == {'id','name_first','name_last','nationality'}
inventor_inserts.append(param)
for rawuuid in rawuuids:
rawinventor_updates.append({'pk': rawuuid, 'update': param['id']})
if i % 100000 == 0:
print i, datetime.now(), rawuuids[0]
print 'finished voting'
session_generator = alchemy.session_generator(dbtype='application')
session = session_generator()
if alchemy.is_mysql():
session.execute('truncate inventor; truncate application_inventor;')
else:
session.execute('delete from inventor; delete from application_inventor;')

from lib.tasks import bulk_commit_inserts, bulk_commit_updates
bulk_commit_inserts(inventor_inserts, App_Inventor.__table__, is_mysql(), 20000,'application')
bulk_commit_inserts(patentinventor_inserts, applicationinventor, is_mysql(), 20000,'application')
bulk_commit_updates('inventor_id', rawinventor_updates, App_RawInventor.__table__, is_mysql(), 20000,'application')


session_generator = alchemy.session_generator(dbtype='grant')
session = session_generator()
gsession = session
doctype = 'grant'
res = session.execute('select location.id, assignee.id from assignee \
inner join rawassignee on rawassignee.assignee_id = assignee.id \
inner join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
inner join location on location.id = rawlocation.location_id;')
session.execute('truncate location_assignee;')
res = session.execute('select location_id, assignee_id from patent \
left join rawassignee on rawassignee.patent_id = patent.id \
left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
where assignee_id != "" and location_id != "";')
assigneelocation = pd.DataFrame.from_records(res.fetchall())
print assigneelocation.info()
assigneelocation = assigneelocation[assigneelocation[0].notnull()]
assigneelocation = assigneelocation[assigneelocation[1].notnull()]
assigneelocation.columns = ['location_id','assignee_id']
assigneelocation = assigneelocation.sort('assignee_id')
print assigneelocation.info()
locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
if doctype == 'grant':
bulk_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000, 'grant')
elif doctype == 'application':
bulk_commit_inserts(locationassignee_inserts, alchemy.schema.app_locationassignee, alchemy.is_mysql(), 20000, 'application')
bulk_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000, 'grant')

session.execute('truncate location_inventor;')
res = session.execute('select location.id, inventor.id from inventor \
left join rawinventor on rawinventor.inventor_id = inventor.id \
right join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
right join location on location.id = rawlocation.location_id;')
res = session.execute('select location_id, inventor_id from patent \
left join rawinventor on rawinventor.patent_id = patent.id \
left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
where inventor_id != "" and location_id != "";')
inventorlocation = pd.DataFrame.from_records(res.fetchall())
inventorlocation.columns = ['location_id','inventor_id']
inventorlocation = inventorlocation.sort('inventor_id')
print inventorlocation.info()
inventorlocation = inventorlocation[inventorlocation[0].notnull()]
inventorlocation = inventorlocation[inventorlocation[1].notnull()]
locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
bulk_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000, 'grant')

doctype = 'application'
session_generator = alchemy.session_generator(dbtype='application')
session = session_generator()
asession = session
session.execute('truncate location_assignee;')
res = session.execute('select location_id, assignee_id from application \
left join rawassignee on rawassignee.application_id = application.id \
left join rawlocation on rawlocation.id = rawassignee.rawlocation_id \
where assignee_id != "" and location_id != "";')
assigneelocation = pd.DataFrame.from_records(res.fetchall())
assigneelocation.columns = ['location_id','assignee_id']
assigneelocation = assigneelocation.sort('assignee_id')
print assigneelocation.info()
locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
bulk_commit_inserts(locationassignee_inserts, alchemy.schema.app_locationassignee, alchemy.is_mysql(), 20000, 'application')

session.execute('truncate location_inventor;')
res = session.execute('select location_id, inventor_id from application \
left join rawinventor on rawinventor.application_id = application.id \
left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
where inventor_id != "" and location_id != "";')
inventorlocation = pd.DataFrame.from_records(res.fetchall())
inventorlocation.columns = ['location_id','inventor_id']
inventorlocation = inventorlocation.drop_duplicates(cols=['location_id','inventor_id'])
inventorlocation = inventorlocation.sort('inventor_id')
print inventorlocation.info()
locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
if doctype == 'grant':
bulk_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000, 'grant')
elif doctype == 'application':
bulk_commit_inserts(locationinventor_inserts, alchemy.schema.app_locationinventor, alchemy.is_mysql(), 20000, 'application')
bulk_commit_inserts(locationinventor_inserts, alchemy.schema.app_locationinventor, alchemy.is_mysql(), 20000, 'application')



#grantsessiongen = alchemy.session_generator(dbtype='grant')
#appsessiongen = alchemy.session_generator(dbtype='application')
#grantsession = grantsessiongen()
#appsession = appsessiongen()

grantedapps = disambig_input[(disambig_input[2] == 1) & (disambig_input[1] == 0)][[0, 6]] # rawinventor_id, application_id

print grantedapps

inserts = [{'pk': x[0], 'update': x[1]} for x in grantedapps.values]

asession.execute('truncate temporary_update;')
asession.commit()
bulk_commit_inserts(inserts, alchemy.schema.app_temporary_update, alchemy.is_mysql(), 20000, 'application')

res = session.execute('select id, number from application right join temporary_update on application.id = temporary_update.update;')
short_ids = res.fetchall()

inserts = [{'pk': x[1], 'update': 0} for x in short_ids]

short_ids = pd.DataFrame.from_records(short_ids).drop_duplicates()
grantedapps = pd.merge(grantedapps, short_ids, left_on=6, right_on=0)
print grantedapps.head(5)

#session_generator = alchemy.session_generator(dbtype='grant')
#session = session_generator()

gsession.execute('truncate temporary_update;')
gsession.commit()
bulk_commit_inserts(inserts, alchemy.schema.temporary_update, alchemy.is_mysql(), 20000, 'grant')

res = gsession.execute('select id, application.patent_id, rawinventor.inventor_id, rawinventor.sequence from application right join temporary_update on temporary_update.pk = application.id left join rawinventor on rawinventor.patent_id = application.patent_id')

patentpairs = res.fetchall()

patentpairs = pd.DataFrame.from_records(patentpairs)

updates = []

grantedapps_groups = grantedapps.groupby(1).groups
patentpairs_groups = patentpairs.sort(3).groupby(0).groups
common = set(grantedapps_groups.keys()).intersection(set(patentpairs_groups.keys()))
num = len(grantedapps_groups)
i = 0
for key in common:
i += 1
if i % 100000 == 0:
print i, datetime.now()
app_rawinventors = list(grantedapps.iloc[grantedapps_groups[key]]['0_x'])
inventor_ids = list(patentpairs.iloc[patentpairs_groups[key]][2])
less = min(len(app_rawinventors), len(inventor_ids))
pairs = zip(app_rawinventors[:less], inventor_ids[:less])
updates.extend([{'pk': x[0], 'update': x[1]} for x in pairs])

pd.DataFrame.from_records(updates).to_csv('updates.tsv',sep='\t',index=False)
gsession.close()
asession.close()
bulk_commit_updates('inventor_id', updates, App_RawInventor.__table__, alchemy.is_mysql(), 20000, 'application')


def main():
if len(sys.argv) <= 2:
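
A minimal sketch of the attribute-voting step that the grant and application loops in integrate() both perform: raw inventor rows sharing a disambiguated inventor_id vote, via collections.Counter, on the canonical full name, and the winner is split into first/last name for the inventor insert. The helper name vote_canonical_name and the sample rows below are illustrative only; the column names match the merged frame in the diff.

    from collections import Counter
    from unidecode import unidecode

    def vote_canonical_name(raw_rows):
        # raw_rows: the raw inventor records grouped under one disambiguated inventor_id
        freq = Counter()
        for raw in raw_rows:
            name = ' '.join(x for x in (raw['name_first'], raw['name_middle'], raw['name_last']) if x)
            freq[name] += 1
        # The most frequent full name wins; its last token becomes name_last,
        # everything before it (including any middle name) becomes name_first
        name = freq.most_common(1)[0][0]
        return unidecode(' '.join(name.split(' ')[:-1])), unidecode(name.split(' ')[-1])

    # Illustrative usage: three raw records grouped under one inventor_id
    rows = [
        {'name_first': 'John', 'name_middle': 'Q', 'name_last': 'Smith'},
        {'name_first': 'John', 'name_middle': '', 'name_last': 'Smith'},
        {'name_first': 'John', 'name_middle': 'Q', 'name_last': 'Smith'},
    ]
    print vote_canonical_name(rows)   # ('John Q', 'Smith')
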