#1 Map MARBLE's domains to SemDoms
Jonathan Janetzki committed May 17, 2023
1 parent 875a3ce commit c7791a0
Showing 2 changed files with 82 additions and 51 deletions.
17 changes: 9 additions & 8 deletions src/dictionary_creator/dictionary_creator.py
@@ -1,18 +1,18 @@
-from collections import defaultdict
-
-import dill
 import os
-import pandas as pd
 import re
 import subprocess
 import sys
 import time
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from datetime import datetime
+from pickle import UnpicklingError
+
+import dill
+import pandas as pd
 from nltk import WordNetLemmatizer, pos_tag
 from nltk.corpus import wordnet
 from nltk.corpus.reader.wordnet import WordNetError
-from pickle import UnpicklingError
 from polyglot.text import Text
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
@@ -413,7 +413,7 @@ def _combine_alignments(self):
         matches = re.search(r'FINAL(.|\n)*cross entropy: (\d+\.\d+)\n *perplexity: (\d+\.\d+)', result.stderr)
         cross_entropy = float(matches.group(2))
         perplexity = float(matches.group(3))
-        print(f'cross entropy: {cross_entropy}, perplexity: {perplexity}')
+        print(f'cross entropy: {str(cross_entropy)}, perplexity: {str(perplexity)}')
 
     def _check_already_done(self, target_progress, load):
         if load:
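
Note: a minimal sketch (not part of the commit) of how the regex in the hunk above pulls the two metrics out of the aligner's stderr; the stderr text here is invented stand-in output:

import re

sample_stderr = ('FINAL iteration\n'
                 '  cross entropy: 3.1415\n'
                 '  perplexity: 23.14\n')
matches = re.search(r'FINAL(.|\n)*cross entropy: (\d+\.\d+)\n *perplexity: (\d+\.\d+)', sample_stderr)
print(float(matches.group(2)), float(matches.group(3)))  # 3.1415 23.14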
@@ -454,8 +454,9 @@ def _execute_and_track_state(self, func, step_name=None, save=False, load=False,
     def _preprocess_data(self):
         self._load_data()
         self._build_sds()
-        self._tokenize_verses()
-        self._combine_alignments()
+
+        # self._tokenize_verses()
+        # self._combine_alignments()
 
     def _add_bidirectional_edge(self, word_1, word_2, count=1):
         word_1.add_aligned_word(word_2, count)
116 changes: 73 additions & 43 deletions src/lexical_domain_mapper.py
@@ -1,13 +1,13 @@
 from collections import defaultdict, Counter
 
+import dill
+import gensim.downloader as api
 import numpy as np
-from tqdm import tqdm
 import pandas as pd
-from dictionary_creator.link_prediction_dictionary_creator import LinkPredictionDictionaryCreator
-from semantic_domain_identifier import SemanticDomainIdentifier
-import dill
 from nltk.metrics.distance import edit_distance
-import gensim.downloader as api
+from tqdm import tqdm
+
+from dictionary_creator.link_prediction_dictionary_creator import LinkPredictionDictionaryCreator
 
 
 def correlate_domains(lexical_domains, sdi):
@@ -97,33 +97,21 @@ def compute_word_vector_distance(string1, string2, word_vectors):
     return np.linalg.norm(vector1 - vector2)
 
 
-def correlate_domain_names(dc):
-    # load results
-    # with open('data/2_sd_labeling/correlated_lds_by_cid.pkl', 'rb') as f:
-    #     correlated_lds_by_cid = dill.load(f)
-    # with open('data/2_sd_labeling/correlated_cids_by_ld.pkl', 'rb') as f:
-    #     correlated_cids_by_ld = dill.load(f)
-    with open('data/2_sd_labeling/correlation_scores_by_ld_by_cid.pkl', 'rb') as f:
-        correlation_scores_by_ld_by_cid = dill.load(f)
-
-    # load names for CIDs from dc
-    sd_name_by_cid = {}
-    sd_df = dc.sds_by_lang['eng']
-    for cid in correlation_scores_by_ld_by_cid.keys():
-        sd_name_by_cid[cid] = list(sd_df[sd_df['cid'] == cid]['category'])[0]
-
-    # add SD names to correlation_scores_by_ld_by_cid keys
-    correlation_scores_by_ld_by_sd = defaultdict(dict)
-    for cid, lds in correlation_scores_by_ld_by_cid.items():
-        for ld, score in lds:
-            correlation_scores_by_ld_by_sd[cid + ' ' + sd_name_by_cid[cid]][ld] = score
-
-    # compute edit distance between sd names and ld names
+def compute_edit_distances_between_sd_names_and_ld_names(sds_and_lds_df):
     edit_distances = defaultdict(dict)
-    for sd_name, lds in correlation_scores_by_ld_by_sd.items():
-        for ld, score in lds.items():
-            pure_sd_name = ' '.join(sd_name.split(' ')[1:])
-            edit_distances[sd_name][ld] = edit_distance(pure_sd_name, ld) / max(len(pure_sd_name), len(ld))
+    for idx, row in sds_and_lds_df.iterrows():
+        try:
+            sd_name = row['semantic domain name'].lower()
+            ld = row['lexical domain name'].lower()
+            pure_sd_name = sd_name  # ' '.join(sd_name.split(' ')[1:])
+            dist = edit_distance(pure_sd_name, ld) / max(len(pure_sd_name), len(ld))
+            sds_and_lds_df.loc[idx, 'edit distance'] = dist
+            edit_distances[sd_name][ld] = dist
+        except:
+            print('Error: ' + str(row))
 
+    # save sd_ld_mapping to csv file
+    sds_and_lds_df.to_csv('data/2_sd_labeling/sd_ld_mapping.csv', index=False)
+
     # sort edit_distances by edit distance
     for sd_name, lds in edit_distances.items():
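
Note: the new function scores name similarity with a length-normalized Levenshtein distance. A self-contained sketch of that formula, using invented domain names:

from nltk.metrics.distance import edit_distance

def normalized_edit_distance(a, b):
    # 0.0 = identical strings, 1.0 = maximally different
    return edit_distance(a, b) / max(len(a), len(b))

print(normalized_edit_distance('body', 'body parts'))  # 6 edits / 10 chars = 0.6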
@@ -147,16 +135,26 @@ def correlate_domain_names(dc):
             f.write('\t' + ld + '\t' + str(score) + '\n')
         f.write('\n')
 
-    # compute distances between SDs and LDs with word vectors
+
+def compute_word_vector_distances_between_sd_names_and_ld_names(sds_and_lds_df):
     # load word vectors
     word_vectors = api.load('word2vec-google-news-300')
 
     # compute distances
     distances = defaultdict(dict)
-    for sd_name, lds in correlation_scores_by_ld_by_sd.items():
-        for ld, score in lds.items():
-            pure_sd_name = ' '.join(sd_name.split(' ')[1:])
-            distances[sd_name][ld] = compute_word_vector_distance(pure_sd_name, ld, word_vectors)
+    for idx, row in sds_and_lds_df.iterrows():
+        try:
+            sd_name = row['semantic domain name'].lower()
+            ld = row['lexical domain name'].lower()
+            pure_sd_name = sd_name  # ' '.join(sd_name.split(' ')[1:])
+            dist = compute_word_vector_distance(pure_sd_name, ld, word_vectors)
+            sds_and_lds_df.loc[idx, 'word vector distance'] = dist
+            distances[sd_name][ld] = dist
+        except:
+            print('Error: ' + str(row))
 
+    # save sd_ld_mapping to csv file
+    sds_and_lds_df.to_csv('data/2_sd_labeling/sd_ld_mapping.csv', index=False)
+
     # filter out lds with distance > 2
     for sd_name, lds in distances.items():
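
Note: only the last line of compute_word_vector_distance appears in this diff (return np.linalg.norm(vector1 - vector2)). A hedged reconstruction, assuming each name is averaged token-wise over the word2vec-google-news-300 vectors; how the real helper handles out-of-vocabulary tokens is not shown:

import numpy as np

def word_vector_distance_sketch(string1, string2, word_vectors):
    tokens1 = [t for t in string1.split() if t in word_vectors]
    tokens2 = [t for t in string2.split() if t in word_vectors]
    if not tokens1 or not tokens2:
        return float('inf')  # no in-vocabulary tokens to compare
    vector1 = np.mean([word_vectors[t] for t in tokens1], axis=0)
    vector2 = np.mean([word_vectors[t] for t in tokens2], axis=0)
    return np.linalg.norm(vector1 - vector2)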
@@ -181,10 +179,39 @@ def correlate_domain_names(dc):
         f.write('\n')
 
 
-def load_sd_ld_mapping(code_by_ld):
+def correlate_domain_names(dc):
+    # load results
+    # with open('data/2_sd_labeling/correlated_lds_by_cid.pkl', 'rb') as f:
+    #     correlated_lds_by_cid = dill.load(f)
+    # with open('data/2_sd_labeling/correlated_cids_by_ld.pkl', 'rb') as f:
+    #     correlated_cids_by_ld = dill.load(f)
+    with open('data/2_sd_labeling/correlation_scores_by_ld_by_cid.pkl', 'rb') as f:
+        correlation_scores_by_ld_by_cid = dill.load(f)
+
+    # load names for CIDs from dc
+    sd_name_by_cid = {}
+    sd_df = dc.sds_by_lang['eng']
+    for cid in correlation_scores_by_ld_by_cid.keys():
+        sd_name_by_cid[cid] = list(sd_df[sd_df['cid'] == cid]['category'])[0]
+
+    # add SD names to correlation_scores_by_ld_by_cid keys
+    correlation_scores_by_ld_by_sd = defaultdict(dict)
+    for cid, lds in correlation_scores_by_ld_by_cid.items():
+        for ld, score in lds:
+            correlation_scores_by_ld_by_sd[cid + ' ' + sd_name_by_cid[cid]][ld] = score
+
+    compute_edit_distances_between_sd_names_and_ld_names(correlation_scores_by_ld_by_sd)
+    compute_word_vector_distances_between_sd_names_and_ld_names(correlation_scores_by_ld_by_sd)
+
+
+def load_sd_ld_mapping(dc, code_by_ld):
     # load sd_ld_mapping.xlsx
     sd_ld_mapping = pd.read_excel('data/2_sd_labeling/sd_ld_mapping.xlsx', engine='openpyxl')
 
+    # add_missing_sd_codes(dc, sd_ld_mapping)
+    # compute_edit_distances_between_sd_names_and_ld_names(sd_ld_mapping)
+    compute_word_vector_distances_between_sd_names_and_ld_names(sd_ld_mapping)
+
     # add code column
     sd_ld_mapping['lexical domain code'] = sd_ld_mapping['lexical domain name'].apply(lambda x: code_by_ld.loc[x])
 
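Note: a toy illustration (with invented data) of the 'lexical domain code' lookup added above; code_by_ld stands in for the code_by_domain frame built in __main__ below, indexed by domain name:

import pandas as pd

code_by_ld = pd.DataFrame({'domain_code': ['001002', '001004']},
                          index=pd.Index(['moon', 'sun'], name='domains'))
sd_ld_mapping = pd.DataFrame({'lexical domain name': ['sun', 'moon']})
sd_ld_mapping['lexical domain code'] = sd_ld_mapping['lexical domain name'].apply(
    lambda x: code_by_ld.loc[x, 'domain_code'])  # scalar lookup; the commit stores .loc[x], the whole row
print(sd_ld_mapping)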
@@ -247,8 +274,12 @@ def add_missing_sd_codes(dc, sd_ld_mapping):
     # sd_df[sd_df['category'] == sd_ld_mapping['semantic domain name']]['cid']
     for index, row in sd_ld_mapping.iterrows():
         if type(row['semantic domain name']) == str:
-            sd_ld_mapping.loc[index, 'semantic domain code'] = sd_df[sd_df['category'] == row['semantic domain name']][
-                'cid'].values[0]
+            try:
+                sd_ld_mapping.loc[index, 'semantic domain code'] = \
+                    sd_df[sd_df['category'] == row['semantic domain name']][
+                        'cid'].values[0]
+            except:
+                print('Error: ' + row['semantic domain name'])
 
 
 def correlate_sds_and_lds_by_words(sd_dataframe, words_by_domain, code_by_ld):
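
Note: the try/except added here guards an exact-match lookup that raises IndexError whenever a semantic domain name has no row in sd_df. A toy example with invented data:

import pandas as pd

sd_df = pd.DataFrame({'cid': ['1.1', '2.1'], 'category': ['Sky', 'Body']})
cid = sd_df[sd_df['category'] == 'Body']['cid'].values[0]  # IndexError if 'Body' were absent
print(cid)  # 2.1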
@@ -285,7 +316,6 @@ def correlate_sds_and_lds_by_words(sd_dataframe, words_by_domain, code_by_ld):
                                              'lexical domain code',
                                              'overlap'])])
 
-
     sd_ld_mapping.to_csv('data/2_sd_labeling/sd_ld_mapping.csv', index=False)
 
 
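Note: the 'overlap' column is filled earlier in correlate_sds_and_lds_by_words, outside this diff. A generic set-overlap (Jaccard) score is one possible reading, shown here with invented word lists; the commit does not confirm this exact formula:

sd_words = {'moon', 'star', 'sun'}
ld_words = {'moon', 'sun', 'sky'}
overlap = len(sd_words & ld_words) / len(sd_words | ld_words)
print(overlap)  # 2 shared words / 4 distinct words = 0.5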
@@ -298,7 +328,7 @@ def correlate_sds_and_lds_by_words(sd_dataframe, words_by_domain, code_by_ld):
     dc = LinkPredictionDictionaryCreator(['bid-eng-DBY-1000', 'bid-fra-fob-1000'], score_threshold=0.2)
     dc._preprocess_data()
 
-    sdi = SemanticDomainIdentifier(dc)
+    # sdi = SemanticDomainIdentifier(dc)
 
     # correlate_domain_names(dc)
 
@@ -320,7 +350,7 @@ def correlate_sds_and_lds_by_words(sd_dataframe, words_by_domain, code_by_ld):
     code_by_domain = code_by_domain.set_index('domains')
     code_by_domain = code_by_domain.rename(columns={'domain_codes': 'domain_code'})
     code_by_domain = code_by_domain.sort_values(by=['domain_code'], ascending=True)
-    # load_sd_ld_mapping(code_by_domain)
+    load_sd_ld_mapping(dc, code_by_domain)
 
     # filter all lines that contain the word "moon"
     moon_lines = domains[domains['english'] == 'moon']
@@ -337,7 +367,7 @@ def correlate_sds_and_lds_by_words(sd_dataframe, words_by_domain, code_by_ld):
     print(words_by_domain)
 
     # Count distinct domains
-    print(domains['domains'].nunique())  # 1020 (OT + NT), 669 (NT)
+    print(domains['domains'].nunique())  # 1019 (OT + NT), 668 (NT) (not counting empty domain: '')
 
     # correlate_domains(domains, sdi)
 
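Note: pandas' nunique() counts the empty string as a distinct value, so the "(not counting empty domain)" in the updated comment implies explicit filtering. A small invented example:

import pandas as pd

domains = pd.DataFrame({'domains': ['1.1 Sky', '1.1 Sky', '2.1 Body', '']})
print(domains['domains'].nunique())                                 # 3, '' included
print(domains.loc[domains['domains'] != '', 'domains'].nunique())   # 2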