From e08cfd1915f2855b58b9c7a96487a2fe5a9bd92c Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 30 Mar 2020 19:34:37 -0700
Subject: [PATCH] slight refactor of the code just so i'm not too embarrassed, and also setting it up better for more similarities in the future

---
 run.py   | 202 ++++++++++++++++++++++++++++++-------------------------
 serve.py |   6 +-
 2 files changed, 114 insertions(+), 94 deletions(-)

diff --git a/run.py b/run.py
index 4069205..656b778 100644
--- a/run.py
+++ b/run.py
@@ -6,97 +6,117 @@
 import requests
 
 import numpy as np
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.feature_extraction import stop_words
-from sklearn import svm
-
 # -----------------------------------------------------------------------------
 
-jstr = requests.get('https://connect.biorxiv.org/relate/collection_json.php?grp=181')
-jall = jstr.json()
-print(f"writing jall.json with {len(jall['rels'])} papers")
-json.dump(jall, open('jall.json', 'w'))
-
-# compute tfidf features with scikit learn
-print("fitting tfidf")
-max_features = 2000
-v = TfidfVectorizer(input='content',
-                    encoding='utf-8', decode_error='replace', strip_accents='unicode',
-                    lowercase=True, analyzer='word', stop_words='english',
-                    token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_-]+\b',
-                    ngram_range=(1, 1), max_features = max_features,
-                    norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True,
-                    max_df=1.0, min_df=1)
-corpus = [a['rel_abs'] for a in jall['rels']]
-v.fit(corpus)
-
-# use tfidf features to find nearest neighbors cheaply
-X = v.transform(corpus)
-D = np.dot(X, X.T).todense()
-IX = np.argsort(-D, axis=1)
-sim = {}
-ntake = 40
-n = IX.shape[0]
-for i in range(n):
-    ixc = [int(IX[i,j]) for j in range(ntake)]
-    ds = [int(D[i,IX[i,j]]*1000) for j in range(ntake)]
-    sim[i] = list(zip(ixc, ds))
-print("writing sim.json")
-json.dump(sim, open('sim.json', 'w'))
-
-# use exemplar SVM to build similarity instead
-print("fitting SVMs per paper")
-svm_sim = {}
-ntake = 40
-for i in range(n):
-    # everything is 0 except the current index - i.e. "exemplar svm"
"exemplar svm" - y = np.zeros(X.shape[0]) - y[i] = 1 - clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-4, C=1.0) - clf.fit(X,y) - s = clf.decision_function(X) - IX = np.argsort(-s) - ixc = [int(IX[j]) for j in range(ntake)] - ds = [int(D[i,IX[j]]*1000) for j in range(ntake)] - svm_sim[i] = list(zip(ixc, ds)) -json.dump(svm_sim, open('svm_sim.json', 'w')) - -# construct a reverse index for suppoorting search -vocab = v.vocabulary_ -idf = v.idf_ -english_stop_words = stop_words.ENGLISH_STOP_WORDS -punc = "'!\"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'" # removed hyphen from string.punctuation -trans_table = {ord(c): None for c in punc} - -def makedict(s, forceidf=None, scale=1.0): - words = set(s.lower().translate(trans_table).strip().split()) - words = set(w for w in words if len(w) > 1 and (not w in english_stop_words)) - idfd = {} - for w in words: # todo: if we're using bigrams in vocab then this won't search over them - if forceidf is None: - if w in vocab: - idfval = idf[vocab[w]] * scale # we have idf for this +def write_json(obj, filename, msg=''): + suffix = f'; {msg}' if msg else '' + print(f"writing {filename}{suffix}") + with open(filename, 'w') as f: + json.dump(obj, f) + + +def calculate_tfidf_features(rels, max_features=5000, max_df=1.0, min_df=3): + """ compute tfidf features with scikit learn """ + from sklearn.feature_extraction.text import TfidfVectorizer + v = TfidfVectorizer(input='content', + encoding='utf-8', decode_error='replace', strip_accents='unicode', + lowercase=True, analyzer='word', stop_words='english', + token_pattern=r'(?u)\b[a-zA-Z_][a-zA-Z0-9_-]+\b', + ngram_range=(1, 1), max_features=max_features, + norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True, + max_df=max_df, min_df=min_df) + corpus = [(a['rel_title'] + '. 
+    X = v.fit_transform(corpus)
+    X = np.asarray(X.astype(np.float32).todense())
+    print("tfidf calculated array of shape ", X.shape)
+    return X, v
+
+
+def calculate_sim_dot_product(X, ntake=40):
+    """ take X (N,D) features and for each index return closest ntake indices via dot product """
+    S = np.dot(X, X.T)
+    IX = np.argsort(S, axis=1)[:, :-ntake-1:-1] # take last ntake sorted backwards
+    return IX.tolist()
+
+
+def calculate_sim_svm(X, ntake=40):
+    """ take X (N,D) features and for each index return closest ntake indices using exemplar SVM """
+    from sklearn import svm
+    n, d = X.shape
+    IX = np.zeros((n, ntake), dtype=np.int64)
+    print(f"training {n} svms for each paper...")
+    for i in range(n):
+        # set all examples as negative except this one
+        y = np.zeros(X.shape[0], dtype=np.float32)
+        y[i] = 1
+        # train an SVM
+        clf = svm.LinearSVC(class_weight='balanced', verbose=False, max_iter=10000, tol=1e-4, C=0.1)
+        clf.fit(X, y)
+        s = clf.decision_function(X)
+        ix = np.argsort(s)[:-ntake-1:-1] # take last ntake sorted backwards
+        IX[i] = ix
+    return IX.tolist()
+
+
+def build_search_index(rels, v):
+    from sklearn.feature_extraction import stop_words
+
+    # construct a reverse index for supporting search
+    vocab = v.vocabulary_
+    idf = v.idf_
+    english_stop_words = stop_words.ENGLISH_STOP_WORDS
+    punc = "'!\"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'" # removed hyphen from string.punctuation
+    trans_table = {ord(c): None for c in punc}
+
+    def makedict(s, forceidf=None):
+        words = set(s.lower().translate(trans_table).strip().split())
+        words = set(w for w in words if len(w) > 1 and (not w in english_stop_words))
+        idfd = {}
+        for w in words: # todo: if we're using bigrams in vocab then this won't search over them
+            if forceidf is None:
+                if w in vocab:
+                    idfval = idf[vocab[w]] # we have a computed idf for this
+                else:
+                    idfval = 1.0 # some word we don't know; assume idf 1.0 (low)
             else:
-                idfval = 1.0 * scale # assume idf 1.0 (low)
-        else:
-            idfval = forceidf
-        idfd[w] = idfval
-    return idfd
-
-def merge_dicts(dlist):
-    m = {}
-    for d in dlist:
-        for k,v in d.items():
-            m[k] = m.get(k,0) + v
-    return m
-
-search_dict = []
-for p in jall['rels']:
-    dict_title = makedict(p['rel_title'], forceidf=10, scale=3)
-    dict_authors = makedict(p['rel_authors'], forceidf=5)
-    dict_summary = makedict(p['rel_abs'])
-    qdict = merge_dicts([dict_title, dict_authors, dict_summary])
-    search_dict.append(qdict)
-
-print("writing search.json")
-json.dump(search_dict, open('search.json', 'w'))
+                idfval = forceidf
+            idfd[w] = idfval
+        return idfd
+
+    def merge_dicts(dlist):
+        m = {}
+        for d in dlist:
+            for k, v in d.items():
+                m[k] = m.get(k,0) + v
+        return m
+
+    search_dict = []
+    for p in rels:
+        dict_title = makedict(p['rel_title'], forceidf=10)
+        dict_authors = makedict(p['rel_authors'], forceidf=5)
+        dict_summary = makedict(p['rel_abs'])
+        qdict = merge_dicts([dict_title, dict_authors, dict_summary])
+        search_dict.append(qdict)
+
+    return search_dict
+
+
+if __name__ == '__main__':
+
+    # fetch the raw data from biorxiv
+    jstr = requests.get('https://connect.biorxiv.org/relate/collection_json.php?grp=181')
+    jall = jstr.json()
+    write_json(jall, 'jall.json', f"{len(jall['rels'])} papers")
+
+    # calculate similarities using various techniques
+    X, v = calculate_tfidf_features(jall['rels'])
+    # similarity using simple dot product on tfidf
+    sim_tfidf = calculate_sim_dot_product(X)
+    write_json(sim_tfidf, 'sim_tfidf_dot.json')
+    # similarity using an exemplar svm on tfidf
+    sim_svm = calculate_sim_svm(X)
+    write_json(sim_svm, 'sim_tfidf_svm.json')
+
+    # calculate the search index to support search
+    search_dict = build_search_index(jall['rels'], v)
+    write_json(search_dict, 'search.json')
diff --git a/serve.py b/serve.py
index 574a58f..4395e55 100644
--- a/serve.py
+++ b/serve.py
@@ -3,10 +3,10 @@
 """
 
 import json
-import argparse
 
 from flask import Flask, request, redirect, url_for
 from flask import render_template
+
 
 # -----------------------------------------------------------------------------
 app = Flask(__name__)
@@ -16,7 +16,7 @@
     jall = json.load(f)
 
 # load computed paper similarities
-with open('svm_sim.json', 'r') as f:
+with open('sim_tfidf_svm.json', 'r') as f:
     sim_dict = json.load(f)
 
 # load search dictionary for each paper
@@ -63,7 +63,7 @@ def sim(doi_prefix=None, doi_suffix=None):
     if pix is None:
         papers = []
     else:
-        sim_ix, match = zip(*sim_dict[str(pix)][:40]) # indices of closest papers
+        sim_ix = sim_dict[pix]
         papers = [jall['rels'][cix] for cix in sim_ix]
     gvars = {'sort_order': 'sim', 'num_papers': len(jall['rels'])}
     context = {'papers': papers, 'gvars': gvars}
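
A minimal sketch of how the regenerated similarity file can be consumed outside of serve.py, assuming run.py has already been executed so that jall.json and sim_tfidf_svm.json exist in the working directory (the query index 0 is an arbitrary example):

import json

# paper metadata and the per-paper similarity lists written by run.py
with open('jall.json', 'r') as f:
    jall = json.load(f)
with open('sim_tfidf_svm.json', 'r') as f:
    sim = json.load(f)

# sim[i] is now a plain list of paper indices, most similar first
# (the query paper itself usually ranks first, so it is skipped here)
query = 0  # arbitrary example index
print('query:  ', jall['rels'][query]['rel_title'])
for ix in sim[query][1:6]:
    print('similar:', jall['rels'][ix]['rel_title'])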