From 157d113141540ac889787cd6ed79387698497212 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 15 May 2023 02:32:06 +0300 Subject: [PATCH 01/69] Initial commit --- lingvodoc/schema/query.py | 146 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c270b5c..726c7921 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,6 +364,7 @@ # Setting up logging. log = logging.getLogger(__name__) +logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -9232,6 +9233,7 @@ def async_cognate_analysis( with transaction.manager: try: + breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -9883,6 +9885,7 @@ def tag_data_plpgsql( entry_already_set.update(entry_id_set) group_list.append(entry_id_set) + breakpoint() return entry_already_set, group_list, time.time() - start_time @staticmethod @@ -10970,6 +10973,7 @@ def perform_cognate_analysis( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) + breakpoint() else: @@ -11034,7 +11038,7 @@ def perform_cognate_analysis( sg_both_count = 0 source_perspective_index = None - + breakpoint() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -12630,6 +12634,140 @@ def f(axes, embedding_pca): return CognateAnalysis(**result_dict) + @staticmethod + def swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status=None, + __debug_flag__=False, + __intermediate_flag__=False): + + # Gathering entry grouping data. + perspective_dict = collections.defaultdict(dict) + + # entry_already_set = set() + # group_list = [] + # tag_dict = collections.defaultdict(set) + + text_dict = {} + entry_id_dict = {} + + entry_already_set, group_list, group_time = ( + CognateAnalysis.tag_data_plpgsql( + perspective_info_list, group_field_id)) + + # Getting text data for each perspective. + + # dbTranslation = aliased(dbEntity, name='Translation') + # dbPublishingTranslation = aliased(dbPublishingEntity, name='PublishingTranslation') + # source_perspective_index = None + + for index, (perspective_id, transcription_field_id, translation_field_id) in \ + enumerate(perspective_info_list): + + # if perspective_id == source_perspective_id: + # source_perspective_index = index + + # Getting and saving perspective info. 
+ perspective = DBSession.query(dbPerspective).filter_by( + client_id=perspective_id[0], object_id=perspective_id[1]).first() + + perspective_name = perspective.get_translation(locale_id) + dictionary_name = perspective.parent.get_translation(locale_id) + + transcription_rules = ( + '' if not perspective.additional_metadata else + perspective.additional_metadata.get('transcription_rules', '')) + + perspective_data = perspective_dict[perspective_id] + + perspective_data['perspective_name'] = perspective_name + perspective_data['dictionary_name'] = dictionary_name + perspective_data['transcription_rules'] = transcription_rules + + log.debug( + '\ncognate_analysis {0}:' + '\n dictionary {1}/{2}: {3}' + '\n perspective {4}/{5}: {6}' + '\n transcription_rules: {7}'.format( + language_str, + perspective.parent_client_id, perspective.parent_object_id, + repr(dictionary_name.strip()), + perspective_id[0], perspective_id[1], + repr(perspective_name.strip()), + repr(transcription_rules))) + + # Getting text data. + translation_query = ( + DBSession.query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id).filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == translation_field_id[0], + dbEntity.field_object_id == translation_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) + + .add_columns( + func.array_agg(dbEntity.content).label('translation')) + + .group_by(dbLexicalEntry)) + + # If we are in asynchronous mode, we need to look up how many data rows we need + # to process for this perspective. + if task_status is not None: + row_count = translation_query.count() + + log.debug( + 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( + language_str, + perspective_id[0], perspective_id[1], + row_count)) + + # Grouping translations by lexical entries. + for row_index, row in enumerate(translation_query.all()): + entry_id = tuple(row[:2]) + transcription_list, translation_list = row[2:4] + + translation_list = ( + [] if not translation_list else [ + translation.strip() + for translation in translation_list + if translation.strip()]) + + # Saving translation data. + entry_data_list = (index, translation_list) + text_dict[entry_id] = entry_data_list + + entry_id_key = ( + index, + (' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) + + entry_id_dict[entry_id_key] = entry_id + @staticmethod def mutate(self, info, **args): """ @@ -12823,7 +12961,7 @@ def mutate(self, info, **args): cognate_suggestions_f if mode == 'suggestions' else cognate_analysis_f) - if analysis_f is None: + if analysis_f is None and False: return ResponseError(message = 'Analysis library fuction \'{0}()\' is absent, ' @@ -12875,7 +13013,7 @@ def mutate(self, info, **args): request.response.status = HTTPOk.code if synchronous: - + breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -12933,7 +13071,7 @@ def mutate(self, info, **args): # We do not use acoustic data, so we perform cognate analysis synchronously. 
else: - + #breakpoint() return CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From 41e87deca7d434d1008660f276d8d9193cc81843 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 15 May 2023 21:27:32 +0300 Subject: [PATCH 02/69] return CognateAnalysis.swadesh_statistics --- lingvodoc/schema/query.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 726c7921..8cf51fd1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -9233,7 +9233,6 @@ def async_cognate_analysis( with transaction.manager: try: - breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -9885,7 +9884,6 @@ def tag_data_plpgsql( entry_already_set.update(entry_id_set) group_list.append(entry_id_set) - breakpoint() return entry_already_set, group_list, time.time() - start_time @staticmethod @@ -12750,7 +12748,7 @@ def swadesh_statistics( # Grouping translations by lexical entries. for row_index, row in enumerate(translation_query.all()): entry_id = tuple(row[:2]) - transcription_list, translation_list = row[2:4] + translation_list = row[2] translation_list = ( [] if not translation_list else [ @@ -12758,6 +12756,8 @@ def swadesh_statistics( for translation in translation_list if translation.strip()]) + print(translation_list) + # Saving translation data. entry_data_list = (index, translation_list) text_dict[entry_id] = entry_data_list @@ -13068,10 +13068,34 @@ def mutate(self, info, **args): return CognateAnalysis(triumph = True) - # We do not use acoustic data, so we perform cognate analysis synchronously. + elif mode == 'swadesh': + return CognateAnalysis.swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # We do not use acoustic data, so we perform cognate analysis synchronously. else: - #breakpoint() + return CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From cff8e76de1f02e93985dcf431cc1d6198300e9e2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 13:30:40 +0300 Subject: [PATCH 03/69] Next steps --- lingvodoc/schema/query.py | 97 ++++++++++++++------------------------- 1 file changed, 35 insertions(+), 62 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8cf51fd1..377687f4 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10971,7 +10971,6 @@ def perform_cognate_analysis( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - breakpoint() else: @@ -12657,93 +12656,67 @@ def swadesh_statistics( __intermediate_flag__=False): # Gathering entry grouping data. - perspective_dict = collections.defaultdict(dict) - - # entry_already_set = set() - # group_list = [] - # tag_dict = collections.defaultdict(set) - + #perspective_dict = collections.defaultdict(dict) text_dict = {} entry_id_dict = {} - entry_already_set, group_list, group_time = ( + _, group_list, group_time = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - # Getting text data for each perspective. 
- - # dbTranslation = aliased(dbEntity, name='Translation') - # dbPublishingTranslation = aliased(dbPublishingEntity, name='PublishingTranslation') - # source_perspective_index = None + #print(f"*** Group list: {group_list}") + # Getting text data for each perspective. for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): - # if perspective_id == source_perspective_id: - # source_perspective_index = index - # Getting and saving perspective info. - perspective = DBSession.query(dbPerspective).filter_by( - client_id=perspective_id[0], object_id=perspective_id[1]).first() + perspective = ( + DBSession + .query(dbPerspective) + .filter_by(client_id=perspective_id[0], object_id=perspective_id[1]) + .first() + ) perspective_name = perspective.get_translation(locale_id) dictionary_name = perspective.parent.get_translation(locale_id) - - transcription_rules = ( - '' if not perspective.additional_metadata else - perspective.additional_metadata.get('transcription_rules', '')) - - perspective_data = perspective_dict[perspective_id] - - perspective_data['perspective_name'] = perspective_name - perspective_data['dictionary_name'] = dictionary_name - perspective_data['transcription_rules'] = transcription_rules + #perspective_data = perspective_dict[perspective_id] + #perspective_data['perspective_name'] = perspective_name + #perspective_data['dictionary_name'] = dictionary_name log.debug( '\ncognate_analysis {0}:' '\n dictionary {1}/{2}: {3}' - '\n perspective {4}/{5}: {6}' - '\n transcription_rules: {7}'.format( + '\n perspective {4}/{5}: {6}'.format( language_str, perspective.parent_client_id, perspective.parent_object_id, repr(dictionary_name.strip()), perspective_id[0], perspective_id[1], - repr(perspective_name.strip()), - repr(transcription_rules))) + repr(perspective_name.strip()))) # Getting text data. translation_query = ( - DBSession.query( - dbLexicalEntry.client_id, - dbLexicalEntry.object_id).filter( - dbLexicalEntry.parent_client_id == perspective_id[0], - dbLexicalEntry.parent_object_id == perspective_id[1], - dbLexicalEntry.marked_for_deletion == False, - dbEntity.parent_client_id == dbLexicalEntry.client_id, - dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == translation_field_id[0], - dbEntity.field_object_id == translation_field_id[1], - dbEntity.marked_for_deletion == False, - dbPublishingEntity.client_id == dbEntity.client_id, - dbPublishingEntity.object_id == dbEntity.object_id, - dbPublishingEntity.published == True, - dbPublishingEntity.accepted == True) - + DBSession + .query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id) + .filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == translation_field_id[0], + dbEntity.field_object_id == translation_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) .add_columns( - func.array_agg(dbEntity.content).label('translation')) - - .group_by(dbLexicalEntry)) - - # If we are in asynchronous mode, we need to look up how many data rows we need - # to process for this perspective. 
- if task_status is not None: - row_count = translation_query.count() - - log.debug( - 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( - language_str, - perspective_id[0], perspective_id[1], - row_count)) + func.array_agg(dbEntity.content).label('translation')) + .group_by(dbLexicalEntry) + .all()) # Grouping translations by lexical entries. for row_index, row in enumerate(translation_query.all()): From 41cd7354b8b8f4398bfaf97aa2014f73095fe25d Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 13:32:35 +0300 Subject: [PATCH 04/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 377687f4..9314300d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12719,7 +12719,7 @@ def swadesh_statistics( .all()) # Grouping translations by lexical entries. - for row_index, row in enumerate(translation_query.all()): + for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] From bf175c2f79b4ee3ef83f641a0559dc07fc0b0a26 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 14:51:09 +0300 Subject: [PATCH 05/69] Compare_translations --- lingvodoc/schema/query.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 9314300d..cb57da26 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12655,6 +12655,24 @@ def swadesh_statistics( __debug_flag__=False, __intermediate_flag__=False): + swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', + 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', + 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', + 'волосы','голова','ухо','глаз','нос','рот','зуб','язык (орган)','ноготь','нога (стопа)','колено', + 'рука (кисть)','живот','горло','грудь','сердце','печень','пить','есть (кушать)','кусать','видеть', + 'слышать','знать','спать','умирать','убивать','плавать','летать','гулять','приходить','лежать', + 'сидеть','стоять','дать','сказать','солнце','луна','звезда','вода','дождь','камень','песок', + 'земля','облако','дым','огонь','пепел','гореть','дорога,тропа','гора','красный','зелёный', + 'жёлтый','белый','чёрный','ночь','тёплый','холодный','полный','новый','хороший','круглый', + 'сухой','имя'] + + def compare_translations(swadesh_lex, dictionary_lex): + def split_lex(lex): + return set(form.lower() + for form in lex.replace(' ', ',').split(',') + if form and '(' not in form) + return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) + # Gathering entry grouping data. #perspective_dict = collections.defaultdict(dict) text_dict = {} @@ -12729,7 +12747,7 @@ def swadesh_statistics( for translation in translation_list if translation.strip()]) - print(translation_list) + print(entry_id, translation_list) # Saving translation data. 
entry_data_list = (index, translation_list) From ffcaf089189077996f70b3462bff36e1d459a1c5 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 16:56:00 +0300 Subject: [PATCH 06/69] Loop by 100words and by translation_list --- lingvodoc/schema/query.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index cb57da26..fdde67d0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12668,24 +12668,23 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - return set(form.lower() - for form in lex.replace(' ', ',').split(',') - if form and '(' not in form) + # Split by comma and open bracket to separate + # various forms of lexem and extra explanation if is + return set(form.strip().lower() + for form in lex.replace('(', ',').split(',') + if form and ')' not in form) return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) # Gathering entry grouping data. - #perspective_dict = collections.defaultdict(dict) text_dict = {} entry_id_dict = {} - _, group_list, group_time = ( + _, group_list, _ = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - #print(f"*** Group list: {group_list}") - # Getting text data for each perspective. - for index, (perspective_id, transcription_field_id, translation_field_id) in \ + for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -12698,9 +12697,6 @@ def split_lex(lex): perspective_name = perspective.get_translation(locale_id) dictionary_name = perspective.parent.get_translation(locale_id) - #perspective_data = perspective_dict[perspective_id] - #perspective_data['perspective_name'] = perspective_name - #perspective_data['dictionary_name'] = dictionary_name log.debug( '\ncognate_analysis {0}:' @@ -12747,7 +12743,10 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) - print(entry_id, translation_list) + for swadesh_lex in swadesh_list: + for translation_lex in translation_list: + if compare_translations(swadesh_lex, translation_lex): + print(entry_id, translation_lex) # Saving translation data. entry_data_list = (index, translation_list) From e6108005617ce92893a41a26e4e3ef0496fb63c2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 21:21:26 +0300 Subject: [PATCH 07/69] First result --- lingvodoc/schema/query.py | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index fdde67d0..59a8c12f 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12672,21 +12672,19 @@ def split_lex(lex): # various forms of lexem and extra explanation if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form and ')' not in form) + if form and (')' not in form)) return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) - # Gathering entry grouping data. - text_dict = {} - entry_id_dict = {} - _, group_list, _ = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) # Getting text data for each perspective. + entries_map = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): + ''' # Getting and saving perspective info. 
perspective = ( DBSession @@ -12707,6 +12705,7 @@ def split_lex(lex): repr(dictionary_name.strip()), perspective_id[0], perspective_id[1], repr(perspective_name.strip()))) + ''' # Getting text data. translation_query = ( @@ -12733,6 +12732,7 @@ def split_lex(lex): .all()) # Grouping translations by lexical entries. + entries_map[perspective_id] = set() for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] @@ -12746,17 +12746,30 @@ def split_lex(lex): for swadesh_lex in swadesh_list: for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - print(entry_id, translation_lex) - - # Saving translation data. - entry_data_list = (index, translation_list) - text_dict[entry_id] = entry_data_list - - entry_id_key = ( - index, - (' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) - - entry_id_dict[entry_id_key] = entry_id + entries_map[perspective_id].add(entry_id) + #print(entry_id, translation_lex) + + # Create dictionary of sets: + # keys: pepspective_id + # values: numbers of groups where an entry from dictionary is met + links = {} + for perspective, entries in entries_map.items(): + links[perspective] = set() + for index_group, group in enumerate(group_list): + if (entries & group): + links[perspective].add(index_group) + + # Calculate intersection between lists of group numbers for all the perspectives + # So length of this intersection is the similarity of corresponding perspectives + similarity = {} + for perspective1, groups1 in links.items(): + similarity[perspective1] = {} + print(perspective1, end=' :: ') + for perspective2, groups2 in links.items(): + commons = len(groups1 & groups2) + similarity[perspective1][perspective2] = commons + print(f"{perspective2}:{commons}", end=' | ') + print() @staticmethod def mutate(self, info, **args): From 63dcb47438818d1becb777c8fef8dc8eb1d02e05 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 21:41:19 +0300 Subject: [PATCH 08/69] Minor --- lingvodoc/schema/query.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 59a8c12f..f50a2af1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12668,11 +12668,12 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - # Split by comma and open bracket to separate - # various forms of lexem and extra explanation if is + # Split by commas and open brackets to separate + # various forms of lexem and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form and (')' not in form)) + if form.strip() and (')' not in form)) #exclude notes + # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) _, group_list, _ = ( @@ -12680,6 +12681,7 @@ def split_lex(lex): perspective_info_list, group_field_id)) # Getting text data for each perspective. 
+ # entries_map gathers words from Swadesh' list met in perspectives entries_map = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -12751,7 +12753,7 @@ def split_lex(lex): # Create dictionary of sets: # keys: pepspective_id - # values: numbers of groups where an entry from dictionary is met + # values: numbers of etymological groups where an entry from dictionary is met links = {} for perspective, entries in entries_map.items(): links[perspective] = set() From 18a340cc184edf25440f07f87799ce079422e649 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 15:05:06 +0300 Subject: [PATCH 09/69] Separated classes --- lingvodoc/schema/query.py | 418 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 418 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f50a2af1..46420db6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12631,6 +12631,424 @@ def f(axes, embedding_pca): return CognateAnalysis(**result_dict) + @staticmethod + def mutate(self, info, **args): + """ + mutation CognateAnalysis { + cognate_analysis( + base_language_id: [508, 41], + group_field_id: [66, 25], + perspective_info_list: [ + [[425, 4], [66, 8], [66, 10]], + [[1552, 1759], [66, 8], [66, 10]], + [[418, 4], [66, 8], [66, 10]]]) + { + triumph + entity_count + dictionary_count + group_count + not_enough_count + text_count + result + } + } + """ + + # Administrator / perspective author / editing permission check. + + error_str = ( + 'Only administrator, perspective author and users with perspective editing permissions ' + 'can perform cognate analysis.') + + client_id = info.context.request.authenticated_userid + + if not client_id: + raise ResponseError(error_str) + + user = Client.get_user_by_client_id(client_id) + + author_client_id_set = ( + + set( + client_id + for (client_id, _), _, _ in args['perspective_info_list'])) + + author_id_check = ( + + DBSession + + .query( + + DBSession + .query(literal(1)) + .filter( + Client.id.in_(author_client_id_set), + Client.user_id == user.id) + .exists()) + + .scalar()) + + if (user.id != 1 and + not author_id_check and + not info.context.acl_check_if('edit', 'perspective', args['source_perspective_id'])): + + raise ResponseError(error_str) + + # Getting arguments. + + source_perspective_id = args['source_perspective_id'] + base_language_id = args['base_language_id'] + + group_field_id = args['group_field_id'] + perspective_info_list = args['perspective_info_list'] + multi_list = args.get('multi_list') + + mode = args.get('mode') + + distance_flag = args.get('distance_flag') + reference_perspective_id = args.get('reference_perspective_id') + + figure_flag = args.get('figure_flag') + distance_vowel_flag = args.get('distance_vowel_flag') + distance_consonant_flag = args.get('distance_consonant_flag') + + match_translations_value = args.get('match_translations_value', 1) + only_orphans_flag = args.get('only_orphans_flag', True) + + __debug_flag__ = args.get('debug_flag', False) + __intermediate_flag__ = args.get('intermediate_flag', False) + + synchronous = args.get('synchronous', False) + + language_str = ( + '{0}/{1}, language {2}/{3}'.format( + source_perspective_id[0], source_perspective_id[1], + base_language_id[0], base_language_id[1])) + + try: + + # Getting base language info. 
+ + locale_id = info.context.get('locale_id') or 2 + + base_language = DBSession.query(dbLanguage).filter_by( + client_id = base_language_id[0], object_id = base_language_id[1]).first() + + base_language_name = base_language.get_translation(locale_id) + + request = info.context.request + storage = request.registry.settings['storage'] + + # Getting multi-language info, if required. + + if multi_list is None: + multi_list = [] + + multi_name_list = [] + + for language_id, perspective_count in multi_list: + + language = DBSession.query(dbLanguage).filter_by( + client_id = language_id[0], object_id = language_id[1]).first() + + multi_name_list.append( + language.get_translation(locale_id)) + + # Language tag. + + if mode == 'multi': + + multi_str = ', '.join( + '{0}/{1}'.format(*id) + for id, count in multi_list) + + language_str = ( + '{0}/{1}, languages {2}'.format( + source_perspective_id[0], source_perspective_id[1], + multi_str)) + + # Showing cognate analysis info, checking cognate analysis library presence. + + log.debug( + '\ncognate_analysis {}:' + '\n base language: {}' + '\n group field: {}/{}' + '\n perspectives and transcription/translation fields: {}' + '\n multi_list: {}' + '\n multi_name_list: {}' + '\n mode: {}' + '\n distance_flag: {}' + '\n reference_perspective_id: {}' + '\n figure_flag: {}' + '\n distance_vowel_flag: {}' + '\n distance_consonant_flag: {}' + '\n match_translations_value: {}' + '\n only_orphans_flag: {} ({})' + '\n __debug_flag__: {}' + '\n __intermediate_flag__: {}' + '\n cognate_analysis_f: {}' + '\n cognate_acoustic_analysis_f: {}' + '\n cognate_distance_analysis_f: {}' + '\n cognate_reconstruction_f: {}' + '\n cognate_reconstruction_multi_f: {}' + '\n cognate_suggestions_f: {}'.format( + language_str, + repr(base_language_name.strip()), + group_field_id[0], group_field_id[1], + perspective_info_list, + multi_list, + multi_name_list, + repr(mode), + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, int(only_orphans_flag), + __debug_flag__, + __intermediate_flag__, + repr(cognate_analysis_f), + repr(cognate_acoustic_analysis_f), + repr(cognate_distance_analysis_f), + repr(cognate_reconstruction_f), + repr(cognate_reconstruction_multi_f), + repr(cognate_suggestions_f))) + + # Checking if we have analysis function ready. + + analysis_f = ( + cognate_acoustic_analysis_f if mode == 'acoustic' else + cognate_reconstruction_f if mode == 'reconstruction' else + cognate_reconstruction_multi_f if mode == 'multi' else + cognate_suggestions_f if mode == 'suggestions' else + cognate_analysis_f) + + if analysis_f is None and False: + + return ResponseError(message = + 'Analysis library fuction \'{0}()\' is absent, ' + 'please contact system administrator.'.format( + 'CognateAcousticAnalysis_GetAllOutput' if mode == 'acoustic' else + 'CognateReconstruct_GetAllOutput' if mode == 'reconstruction' else + 'CognateMultiReconstruct_GetAllOutput' if mode == 'multi' else + 'GuessCognates_GetAllOutput' if mode == 'suggestions' else + 'CognateAnalysis_GetAllOutput')) + + # Transforming client/object pair ids from lists to 2-tuples. 
+ + source_perspective_id = tuple(source_perspective_id) + base_language_id = tuple(base_language_id) + group_field_id = tuple(group_field_id) + + perspective_info_list = [ + + (tuple(perspective_id), + tuple(transcription_field_id), + tuple(translation_field_id)) + + for perspective_id, + transcription_field_id, + translation_field_id in perspective_info_list] + + multi_list = [ + [tuple(language_id), perspective_count] + for language_id, perspective_count in multi_list] + + if reference_perspective_id is not None: + reference_perspective_id = tuple(reference_perspective_id) + + # If we are to use acoustic data, we will launch cognate analysis in asynchronous mode. + + if mode == 'acoustic': + + client_id = request.authenticated_userid + + user_id = ( + Client.get_user_by_client_id(client_id).id + if client_id else anonymous_userid(request)) + + task_status = TaskStatus( + user_id, 'Cognate acoustic analysis', base_language_name, 5) + + # Launching cognate acoustic analysis asynchronously. + + request.response.status = HTTPOk.code + + if synchronous: + breakpoint() + CognateAnalysis.perform_cognate_analysis( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + None, + None, + None, + None, + None, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status, + __debug_flag__, + __intermediate_flag__) + + else: + + async_cognate_analysis.delay( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status.key, + request.registry.settings['cache_kwargs'], + request.registry.settings['sqlalchemy.url'], + __debug_flag__, + __intermediate_flag__) + + # Signifying that we've successfully launched asynchronous cognate acoustic analysis. + + return CognateAnalysis(triumph = True) + + elif mode == 'swadesh': + + return CognateAnalysis.swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # We do not use acoustic data, so we perform cognate analysis synchronously. + else: + + return CognateAnalysis.perform_cognate_analysis( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # Exception occured while we tried to perform cognate analysis. 
+ + except Exception as exception: + + traceback_string = ''.join(traceback.format_exception( + exception, exception, exception.__traceback__))[:-1] + + log.warning( + 'cognate_analysis {0}: exception'.format( + language_str)) + + log.warning(traceback_string) + + return ResponseError(message = + 'Exception:\n' + traceback_string) + + +class SwadeshAnalysis(graphene.Mutation): + class Arguments: + + source_perspective_id = LingvodocID(required = True) + base_language_id = LingvodocID(required = True) + + group_field_id = LingvodocID(required = True) + perspective_info_list = graphene.List(graphene.List(LingvodocID), required = True) + multi_list = graphene.List(ObjectVal) + + mode = graphene.String() + + distance_flag = graphene.Boolean() + reference_perspective_id = LingvodocID() + + figure_flag = graphene.Boolean() + distance_vowel_flag = graphene.Boolean() + distance_consonant_flag = graphene.Boolean() + + match_translations_value = graphene.Int() + only_orphans_flag = graphene.Boolean() + + debug_flag = graphene.Boolean() + intermediate_flag = graphene.Boolean() + + synchronous = graphene.Boolean() + + triumph = graphene.Boolean() + + dictionary_count = graphene.Int() + group_count = graphene.Int() + not_enough_count = graphene.Int() + transcription_count = graphene.Int() + translation_count = graphene.Int() + + result = graphene.String() + xlsx_url = graphene.String() + distance_list = graphene.Field(ObjectVal) + figure_url = graphene.String() + + minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) + embedding_2d = graphene.List(graphene.List(graphene.Float)) + embedding_3d = graphene.List(graphene.List(graphene.Float)) + perspective_name_list = graphene.List(graphene.String) + + suggestion_list = graphene.List(ObjectVal) + suggestion_field_id = LingvodocID() + + intermediate_url_list = graphene.List(graphene.String) + @staticmethod def swadesh_statistics( language_str, From 43b49484dc1bec38cdf9e9f700d7046fe5ea2435 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 18:39:13 +0300 Subject: [PATCH 10/69] Next steps to separate classes --- lingvodoc/schema/query.py | 331 ++------------------------------------ 1 file changed, 14 insertions(+), 317 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 46420db6..393204a1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13007,71 +13007,15 @@ class Arguments: group_field_id = LingvodocID(required = True) perspective_info_list = graphene.List(graphene.List(LingvodocID), required = True) - multi_list = graphene.List(ObjectVal) - - mode = graphene.String() - - distance_flag = graphene.Boolean() - reference_perspective_id = LingvodocID() - - figure_flag = graphene.Boolean() - distance_vowel_flag = graphene.Boolean() - distance_consonant_flag = graphene.Boolean() - - match_translations_value = graphene.Int() - only_orphans_flag = graphene.Boolean() - - debug_flag = graphene.Boolean() - intermediate_flag = graphene.Boolean() - - synchronous = graphene.Boolean() triumph = graphene.Boolean() - dictionary_count = graphene.Int() - group_count = graphene.Int() - not_enough_count = graphene.Int() - transcription_count = graphene.Int() - translation_count = graphene.Int() - - result = graphene.String() - xlsx_url = graphene.String() - distance_list = graphene.Field(ObjectVal) - figure_url = graphene.String() - - minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) - embedding_2d = graphene.List(graphene.List(graphene.Float)) - embedding_3d = 
graphene.List(graphene.List(graphene.Float)) - perspective_name_list = graphene.List(graphene.String) - - suggestion_list = graphene.List(ObjectVal) - suggestion_field_id = LingvodocID() - - intermediate_url_list = graphene.List(graphene.String) - @staticmethod def swadesh_statistics( language_str, - source_perspective_id, - base_language_id, - base_language_name, group_field_id, perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status=None, - __debug_flag__=False, - __intermediate_flag__=False): + locale_id): swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', @@ -13117,7 +13061,7 @@ def split_lex(lex): dictionary_name = perspective.parent.get_translation(locale_id) log.debug( - '\ncognate_analysis {0}:' + '\nswadesh_analysis {0}:' '\n dictionary {1}/{2}: {3}' '\n perspective {4}/{5}: {6}'.format( language_str, @@ -13194,8 +13138,8 @@ def split_lex(lex): @staticmethod def mutate(self, info, **args): """ - mutation CognateAnalysis { - cognate_analysis( + mutation SwadeshAnalysis { + swadesh_analysis( base_language_id: [508, 41], group_field_id: [66, 25], perspective_info_list: [ @@ -13203,22 +13147,14 @@ def mutate(self, info, **args): [[1552, 1759], [66, 8], [66, 10]], [[418, 4], [66, 8], [66, 10]]]) { - triumph - entity_count - dictionary_count - group_count - not_enough_count - text_count - result - } + triumph } } """ # Administrator / perspective author / editing permission check. - error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform cognate analysis.') + 'can perform swadesh analysis.') client_id = info.context.request.authenticated_userid @@ -13261,24 +13197,6 @@ def mutate(self, info, **args): group_field_id = args['group_field_id'] perspective_info_list = args['perspective_info_list'] - multi_list = args.get('multi_list') - - mode = args.get('mode') - - distance_flag = args.get('distance_flag') - reference_perspective_id = args.get('reference_perspective_id') - - figure_flag = args.get('figure_flag') - distance_vowel_flag = args.get('distance_vowel_flag') - distance_consonant_flag = args.get('distance_consonant_flag') - - match_translations_value = args.get('match_translations_value', 1) - only_orphans_flag = args.get('only_orphans_flag', True) - - __debug_flag__ = args.get('debug_flag', False) - __intermediate_flag__ = args.get('intermediate_flag', False) - - synchronous = args.get('synchronous', False) language_str = ( '{0}/{1}, language {2}/{3}'.format( @@ -13299,102 +13217,6 @@ def mutate(self, info, **args): request = info.context.request storage = request.registry.settings['storage'] - # Getting multi-language info, if required. - - if multi_list is None: - multi_list = [] - - multi_name_list = [] - - for language_id, perspective_count in multi_list: - - language = DBSession.query(dbLanguage).filter_by( - client_id = language_id[0], object_id = language_id[1]).first() - - multi_name_list.append( - language.get_translation(locale_id)) - - # Language tag. 
- - if mode == 'multi': - - multi_str = ', '.join( - '{0}/{1}'.format(*id) - for id, count in multi_list) - - language_str = ( - '{0}/{1}, languages {2}'.format( - source_perspective_id[0], source_perspective_id[1], - multi_str)) - - # Showing cognate analysis info, checking cognate analysis library presence. - - log.debug( - '\ncognate_analysis {}:' - '\n base language: {}' - '\n group field: {}/{}' - '\n perspectives and transcription/translation fields: {}' - '\n multi_list: {}' - '\n multi_name_list: {}' - '\n mode: {}' - '\n distance_flag: {}' - '\n reference_perspective_id: {}' - '\n figure_flag: {}' - '\n distance_vowel_flag: {}' - '\n distance_consonant_flag: {}' - '\n match_translations_value: {}' - '\n only_orphans_flag: {} ({})' - '\n __debug_flag__: {}' - '\n __intermediate_flag__: {}' - '\n cognate_analysis_f: {}' - '\n cognate_acoustic_analysis_f: {}' - '\n cognate_distance_analysis_f: {}' - '\n cognate_reconstruction_f: {}' - '\n cognate_reconstruction_multi_f: {}' - '\n cognate_suggestions_f: {}'.format( - language_str, - repr(base_language_name.strip()), - group_field_id[0], group_field_id[1], - perspective_info_list, - multi_list, - multi_name_list, - repr(mode), - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, int(only_orphans_flag), - __debug_flag__, - __intermediate_flag__, - repr(cognate_analysis_f), - repr(cognate_acoustic_analysis_f), - repr(cognate_distance_analysis_f), - repr(cognate_reconstruction_f), - repr(cognate_reconstruction_multi_f), - repr(cognate_suggestions_f))) - - # Checking if we have analysis function ready. - - analysis_f = ( - cognate_acoustic_analysis_f if mode == 'acoustic' else - cognate_reconstruction_f if mode == 'reconstruction' else - cognate_reconstruction_multi_f if mode == 'multi' else - cognate_suggestions_f if mode == 'suggestions' else - cognate_analysis_f) - - if analysis_f is None and False: - - return ResponseError(message = - 'Analysis library fuction \'{0}()\' is absent, ' - 'please contact system administrator.'.format( - 'CognateAcousticAnalysis_GetAllOutput' if mode == 'acoustic' else - 'CognateReconstruct_GetAllOutput' if mode == 'reconstruction' else - 'CognateMultiReconstruct_GetAllOutput' if mode == 'multi' else - 'GuessCognates_GetAllOutput' if mode == 'suggestions' else - 'CognateAnalysis_GetAllOutput')) - # Transforming client/object pair ids from lists to 2-tuples. source_perspective_id = tuple(source_perspective_id) @@ -13411,146 +13233,20 @@ def mutate(self, info, **args): transcription_field_id, translation_field_id in perspective_info_list] - multi_list = [ - [tuple(language_id), perspective_count] - for language_id, perspective_count in multi_list] - - if reference_perspective_id is not None: - reference_perspective_id = tuple(reference_perspective_id) - - # If we are to use acoustic data, we will launch cognate analysis in asynchronous mode. - - if mode == 'acoustic': - - client_id = request.authenticated_userid - - user_id = ( - Client.get_user_by_client_id(client_id).id - if client_id else anonymous_userid(request)) - - task_status = TaskStatus( - user_id, 'Cognate acoustic analysis', base_language_name, 5) - - # Launching cognate acoustic analysis asynchronously. 
- - request.response.status = HTTPOk.code - - if synchronous: - breakpoint() - CognateAnalysis.perform_cognate_analysis( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - None, - None, - None, - None, - None, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status, - __debug_flag__, - __intermediate_flag__) - - else: - - async_cognate_analysis.delay( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status.key, - request.registry.settings['cache_kwargs'], - request.registry.settings['sqlalchemy.url'], - __debug_flag__, - __intermediate_flag__) - - # Signifying that we've successfully launched asynchronous cognate acoustic analysis. - - return CognateAnalysis(triumph = True) - - elif mode == 'swadesh': - - return CognateAnalysis.swadesh_statistics( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - - # We do not use acoustic data, so we perform cognate analysis synchronously. - else: - - return CognateAnalysis.perform_cognate_analysis( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - - # Exception occured while we tried to perform cognate analysis. + return SwadeshAnalysis.swadesh_statistics( + language_str, + group_field_id, + perspective_info_list, + locale_id) + # Exception occured while we tried to perform swadesh analysis. except Exception as exception: traceback_string = ''.join(traceback.format_exception( exception, exception, exception.__traceback__))[:-1] log.warning( - 'cognate_analysis {0}: exception'.format( + 'swadesh_analysis {0}: exception'.format( language_str)) log.warning(traceback_string) @@ -19392,6 +19088,7 @@ class MyMutations(graphene.ObjectType): starling_etymology = StarlingEtymology.Field() phonemic_analysis = PhonemicAnalysis.Field() cognate_analysis = CognateAnalysis.Field() + swadesh_analysis = SwadeshAnalysis.Field() phonology = Phonology.Field() phonological_statistical_distance = PhonologicalStatisticalDistance.Field() sound_and_markup = SoundAndMarkup.Field() From 072bd39c2c63cdf3901f3a0605e166df6a6a5228 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 19:35:44 +0300 Subject: [PATCH 11/69] Fixes and cleanup --- lingvodoc/schema/query.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 393204a1..236ee835 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,7 +364,7 @@ # Setting up logging. 
log = logging.getLogger(__name__) -logging.disable(level=logging.INFO) +#logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -11035,7 +11035,6 @@ def perform_cognate_analysis( sg_both_count = 0 source_perspective_index = None - breakpoint() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -12876,7 +12875,6 @@ def mutate(self, info, **args): request.response.status = HTTPOk.code if synchronous: - breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From 3fca48b09d5e6f5d90c0f299f3b3e0e31d79cd98 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:24:57 +0300 Subject: [PATCH 12/69] Calculate commons_total as intersection Swadesh' entries --- lingvodoc/schema/query.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 236ee835..e1d4f21a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,7 +364,7 @@ # Setting up logging. log = logging.getLogger(__name__) -#logging.disable(level=logging.INFO) +logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -13041,8 +13041,10 @@ def split_lex(lex): perspective_info_list, group_field_id)) # Getting text data for each perspective. - # entries_map gathers words from Swadesh' list met in perspectives - entries_map = {} + # entries_set gathers entry_id(s) of words met in Swadesh' list + # swadesh_set gathers numbers of words within Swadesh' list + entries_set = {} + swadesh_set = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -13094,7 +13096,8 @@ def split_lex(lex): .all()) # Grouping translations by lexical entries. 
- entries_map[perspective_id] = set() + entries_set[perspective_id] = set() + swadesh_set[perspective_id] = set() for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] @@ -13105,32 +13108,36 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) - for swadesh_lex in swadesh_list: + for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - entries_map[perspective_id].add(entry_id) - #print(entry_id, translation_lex) + # Store entry_id and number of the lex within Swadesh' list + entries_set[perspective_id].add(entry_id) + swadesh_set[perspective_id].add(swadesh_num) + #print(entry_id, swadesh_num, translation_lex) # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met links = {} - for perspective, entries in entries_map.items(): + for perspective, entries in entries_set.items(): links[perspective] = set() - for index_group, group in enumerate(group_list): + for group_index, group in enumerate(group_list): if (entries & group): - links[perspective].add(index_group) + links[perspective].add(group_index) # Calculate intersection between lists of group numbers for all the perspectives # So length of this intersection is the similarity of corresponding perspectives similarity = {} - for perspective1, groups1 in links.items(): + for n1, (perspective1, groups1) in enumerate(links.items()): similarity[perspective1] = {} print(perspective1, end=' :: ') - for perspective2, groups2 in links.items(): - commons = len(groups1 & groups2) - similarity[perspective1][perspective2] = commons - print(f"{perspective2}:{commons}", end=' | ') + for n2, (perspective2, groups2) in enumerate(links.items()): + if n2 <= n1: continue #exclude duplicates and self-to-self + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + commons_linked = len(groups1 & groups2) + similarity[perspective1][perspective2] = commons_total, commons_linked + print(f"{perspective2}:{commons_linked}/{commons_total}", end=' | ') print() @staticmethod From 15bb2f0723328b033d679fe6c0ebabd2c2486e4a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:43:53 +0300 Subject: [PATCH 13/69] divergence_time --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e1d4f21a..755c5d29 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13126,8 +13126,9 @@ def split_lex(lex): if (entries & group): links[perspective].add(group_index) - # Calculate intersection between lists of group numbers for all the perspectives + # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives + # commons_total means amount of Swadesh's lexems met in the both perspectives similarity = {} for n1, (perspective1, groups1) in enumerate(links.items()): similarity[perspective1] = {} @@ -13136,8 +13137,9 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) + divergence_time = -10 * math.log(commons_linked / commons_total) similarity[perspective1][perspective2] = commons_total, commons_linked - 
print(f"{perspective2}:{commons_linked}/{commons_total}", end=' | ') + print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() @staticmethod From 47a473dc153a975f982c3eb8a00188d86a629b69 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:47:37 +0300 Subject: [PATCH 14/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 755c5d29..d4659e4e 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13137,7 +13137,7 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - divergence_time = -10 * math.log(commons_linked / commons_total) + divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_total > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() From 9272fa9c60c9a6d251df9351b6b883c788fe0702 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:58:56 +0300 Subject: [PATCH 15/69] Math fix --- lingvodoc/schema/query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d4659e4e..de296af8 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13137,7 +13137,8 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_total > 0 else -1) + # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. + divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() From ab40a1c1b50933067bed45f9978f3648f5a76539 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 18:30:26 +0300 Subject: [PATCH 16/69] Exclude borrowings --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index de296af8..e7c98c21 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13032,7 +13032,9 @@ def split_lex(lex): # various forms of lexem and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form.strip() and (')' not in form)) #exclude notes + if form.strip() + and (')' not in form) + and (' заим.' not in form)) #exclude notes and borrowings # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) @@ -13140,7 +13142,7 @@ def split_lex(lex): # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. 
divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked - print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') + print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') print() @staticmethod From 16f8f0aad2eaad9fb1d32ad926f8be82621725c8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 18:59:05 +0300 Subject: [PATCH 17/69] Cleanup --- lingvodoc/schema/query.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e7c98c21..a6dc4421 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,8 +13033,8 @@ def split_lex(lex): return set(form.strip().lower() for form in lex.replace('(', ',').split(',') if form.strip() - and (')' not in form) - and (' заим.' not in form)) #exclude notes and borrowings + and ')' not in form + and ' заим.' not in form) #exclude notes and borrowings # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) @@ -13139,8 +13139,8 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. - divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) + # commons_linked > 0 means that commons_total > 0 even more so + divergence_time = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') print() From 64e70d610d17ca0c0bb37c935b783f2b4934a333 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 19:23:56 +0300 Subject: [PATCH 18/69] Minor --- lingvodoc/schema/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a6dc4421..5805964d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13140,9 +13140,9 @@ def split_lex(lex): commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) # commons_linked > 0 means that commons_total > 0 even more so - divergence_time = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 - similarity[perspective1][perspective2] = commons_total, commons_linked - print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 + similarity[perspective1][perspective2] = commons_linked, commons_total + print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() @staticmethod From d530914fb30cfa26a72af210d43bc8d0ae0f7705 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 20:49:51 +0300 Subject: [PATCH 19/69] distance_graph function --- lingvodoc/schema/query.py | 782 +++++++++++++++++++------------------- 1 file changed, 400 insertions(+), 382 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 5805964d..9a77feaa 100644 --- a/lingvodoc/schema/query.py 
+++ b/lingvodoc/schema/query.py @@ -10887,6 +10887,399 @@ def f_callback(xyz): return result_x, f(result.x) + @staticmethod + def distance_graph( + language_str, + distance_data_array, + distance_header_array, + __debug_flag__): + + d_ij = (distance_data_array + distance_data_array.T) / 2 + + log.debug( + '\ncognate_analysis {0}:' + '\ndistance_header_array:\n{1}' + '\ndistance_data_array:\n{2}' + '\nd_ij:\n{3}'.format( + language_str, + distance_header_array, + distance_data_array, + d_ij)) + + # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to + # orient it left-right. + + if len(distance_data_array) > 1: + + embedding_2d, strain_2d = ( + CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) + + embedding_2d_pca = ( + sklearn.decomposition.PCA(n_components = 2) + .fit_transform(embedding_2d)) + + distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) + + else: + + embedding_2d = numpy.zeros((1, 2)) + embedding_2d_pca = numpy.zeros((1, 2)) + + strain_2d = 0.0 + + distance_2d = numpy.zeros((1, 1)) + + # Showing what we computed. + + log.debug( + '\ncognate_analysis {0}:' + '\nembedding 2d:\n{1}' + '\nembedding 2d (PCA-oriented):\n{2}' + '\nstrain 2d:\n{3}' + '\ndistances 2d:\n{4}'.format( + language_str, + embedding_2d, + embedding_2d_pca, + strain_2d, + distance_2d)) + + # And now the same with 3d embedding. + + if len(distance_data_array) > 1: + + embedding_3d, strain_3d = ( + CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) + + # At least three points, standard PCA-based orientation. + + if len(distance_data_array) >= 3: + + embedding_3d_pca = ( + sklearn.decomposition.PCA(n_components = 3) + .fit_transform(embedding_3d)) + + # Only two points, so we take 2d embedding and extend it with zeros. + + else: + + embedding_3d_pca = ( + + numpy.hstack(( + embedding_2d_pca, + numpy.zeros((embedding_2d_pca.shape[0], 1))))) + + # Making 3d embedding actually 3d, if required. + + if embedding_3d_pca.shape[1] <= 2: + + embedding_3d_pca = ( + + numpy.hstack(( + embedding_3d_pca, + numpy.zeros((embedding_3d_pca.shape[0], 1))))) + + distance_3d = ( + sklearn.metrics.euclidean_distances(embedding_3d_pca)) + + else: + + embedding_3d = numpy.zeros((1, 3)) + embedding_3d_pca = numpy.zeros((1, 3)) + + strain_3d = 0.0 + + distance_3d = numpy.zeros((1, 1)) + + # Showing what we've get. + + log.debug( + '\ncognate_analysis {0}:' + '\nembedding 3d:\n{1}' + '\nembedding 3d (PCA-oriented):\n{2}' + '\nstrain 3d:\n{3}' + '\ndistances 3d:\n{4}'.format( + language_str, + embedding_3d, + embedding_3d_pca, + strain_3d, + distance_3d)) + + # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d + # embedding distances to break ties. + + if len(distance_data_array) <= 1: + mst_list = [] + + else: + + d_min, d_extra_min, min_i, min_j = min( + (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j) + for i in range(d_ij.shape[0] - 1) + for j in range(i + 1, d_ij.shape[0])) + + mst_list = [(min_i, min_j)] + mst_dict = {} + + # MST construction initialization. + + for i in range(d_ij.shape[0]): + + if i == min_i or i == min_j: + continue + + d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i]) + d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j]) + + mst_dict[i] = ( + (d_min_i, min_i) if d_min_i <= d_min_j else + (d_min_j, min_i)) + + # Iterative MST construction. 
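
# For illustration, a minimal standalone sketch (all distances made up) of the
# Jarnik-Prim construction that the loop below completes; the patch code above
# additionally seeds the tree with the globally smallest edge, while this
# sketch simply starts from vertex 0.  Candidate edges are compared as
# (d_ij, d_2d + d_3d) tuples, so Python's lexicographic tuple ordering is what
# breaks ties on the raw cognate distance.

import numpy

def prim_mst(d, d_extra):
    # d and d_extra are symmetric (n, n) distance matrices.
    n = d.shape[0]
    in_tree = {0}
    edge_list = []
    while len(in_tree) < n:
        _, _, i_from, i_to = min(
            (d[i, j], d_extra[i, j], i, j)
            for i in in_tree
            for j in range(n) if j not in in_tree)
        edge_list.append((i_from, i_to))
        in_tree.add(i_to)
    return edge_list

# Vertices 1 and 2 are equally far from vertex 0, so the extra distance
# decides which edge enters the tree first.
d = numpy.array([[0.0, 2.0, 2.0], [2.0, 0.0, 5.0], [2.0, 5.0, 0.0]])
d_extra = numpy.array([[0.0, 1.0, 3.0], [1.0, 0.0, 4.0], [3.0, 4.0, 0.0]])
print(prim_mst(d, d_extra))   # [(0, 1), (0, 2)]
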
+ + while len(mst_dict) > 0: + + (d_min, d_extra_min, i_min, i_from_min) = min( + (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) + + log.debug('\n' + pprint.pformat(mst_dict)) + log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) + + mst_list.append((i_from_min, i_min)) + del mst_dict[i_min] + + # Updating shortest connection info. + + for i_to in mst_dict.keys(): + + d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) + + if d_to < mst_dict[i_to][0]: + mst_dict[i_to] = (d_to, i_min) + + log.debug( + '\ncognate_analysis {0}:' + '\nminimum spanning tree:\n{1}'.format( + language_str, + pprint.pformat(mst_list))) + + # Plotting with matplotlib. + + figure = pyplot.figure(figsize = (10, 10)) + axes = figure.add_subplot(212) + + axes.set_title( + 'Etymological distance tree (relative distance embedding)', + fontsize = 14, family = 'Gentium') + + axes.axis('equal') + axes.axis('off') + axes.autoscale() + + def f(axes, embedding_pca): + """ + Plots specified graph embedding on a given axis. + """ + + flag_3d = numpy.size(embedding_pca, 1) > 2 + + for index, (position, name) in enumerate( + zip(embedding_pca, distance_header_array)): + + # Checking if any of the previous perspectives are already in this perspective's + # position. + + same_position_index = None + + for i, p in enumerate(embedding_pca[:index]): + if numpy.linalg.norm(position - p) <= 1e-3: + + same_position_index = i + break + + color = matplotlib.colors.hsv_to_rgb( + [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) + + label_same_str = ( + '' if same_position_index is None else + ' (same as {0})'.format(same_position_index + 1)) + + kwargs = { + 's': 35, + 'color': color, + 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} + + axes.scatter(*position, **kwargs) + + # Annotating position with its number, but only if we hadn't already annotated nearby. + + if same_position_index is None: + + if flag_3d: + + axes.text( + position[0] + 0.01, position[1], position[2] + 0.01, + str(index + 1), None, fontsize = 14) + + else: + + axes.annotate( + str(index + 1), + (position[0] + 0.01, position[1] - 0.005), + fontsize = 14) + + # Plotting minimum spanning trees. + + line_list = [ + (embedding_pca[i], embedding_pca[j]) + for i, j in mst_list] + + line_collection = ( + Line3DCollection if flag_3d else LineCollection)( + line_list, zorder = 0, color = 'gray') + + axes.add_collection(line_collection) + + pyplot.setp(axes.texts, family = 'Gentium') + + # Plotting our embedding, creating the legend. + + f(axes, embedding_2d_pca) + + pyplot.tight_layout() + + legend = axes.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) + + pyplot.setp(legend.texts, family = 'Gentium') + axes.autoscale_view() + + # Saving generated figure for debug purposes, if required. + + if __debug_flag__: + + figure_file_name = ( + 'figure cognate distance{0}.png'.format( + mode_name_str)) + + with open(figure_file_name, 'wb') as figure_file: + + pyplot.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Also generating 3d embedding figure. 
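
# Note on the extraction above: the debug branch just above formats its file
# name with mode_name_str, and the figure-saving code further below uses
# base_language_name, mode, current_datetime, storage_dir, cur_time and
# storage.  None of these are parameters of distance_graph() at this point in
# the series (later patches thread several of them through), so unless they
# happen to be defined at an enclosing scope the call would fail in the same
# way as this toy case:

def extracted():
    # mode_name_str is looked up as a global at call time.
    return 'figure cognate distance{0}.png'.format(mode_name_str)

try:
    extracted()
except NameError as error:
    print(error)   # name 'mode_name_str' is not defined
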
+ + figure_3d = pyplot.figure() + figure_3d.set_size_inches(16, 10) + + axes_3d = figure_3d.add_subplot(111, projection = '3d') + + axes_3d.axis('equal') + axes_3d.view_init(elev = 30, azim = -75) + + f(axes_3d, embedding_3d_pca) + + # Setting up legend. + + axes_3d.set_xlabel('X') + axes_3d.set_ylabel('Y') + axes_3d.set_zlabel('Z') + + legend_3d = axes_3d.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) + + pyplot.setp(legend_3d.texts, family = 'Gentium') + + # Fake cubic bounding box to force axis aspect ratios, see + # https://stackoverflow.com/a/13701747/2016856. + + X = embedding_3d_pca[:,0] + Y = embedding_3d_pca[:,1] + Z = embedding_3d_pca[:,2] + + max_range = numpy.array([ + X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() + + Xb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + + 0.5 * (X.max() + X.min())) + + Yb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + + 0.5 * (Y.max() + Y.min())) + + Zb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + + 0.5 * (Z.max() + Z.min())) + + for xb, yb, zb in zip(Xb, Yb, Zb): + axes_3d.plot([xb], [yb], [zb], 'w') + + axes_3d.autoscale_view() + + # And saving it. + + figure_3d_file_name = ( + 'figure 3d cognate distance{0}.png'.format( + mode_name_str)) + + with open(figure_3d_file_name, 'wb') as figure_3d_file: + + figure_3d.savefig( + figure_3d_file, + bbox_extra_artists = (legend_3d,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Storing generated figure as a PNG image. + + figure_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + figure_path = os.path.join(storage_dir, figure_filename) + os.makedirs(os.path.dirname(figure_path), exist_ok = True) + + with open(figure_path, 'wb') as figure_file: + + figure.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + figure_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', figure_filename]) + + return ( + figure_url, + mst_list, + embedding_2d_pca, + embedding_3d_pca + ) + @staticmethod def perform_cognate_analysis( language_str, @@ -12196,396 +12589,21 @@ def perform_cognate_analysis( distance_list)) # Generating distance graph, if required. - figure_url = None - mst_list = None embedding_2d_pca = None embedding_3d_pca = None if figure_flag: - - d_ij = (distance_data_array + distance_data_array.T) / 2 - - log.debug( - '\ncognate_analysis {0}:' - '\ndistance_header_array:\n{1}' - '\ndistance_data_array:\n{2}' - '\nd_ij:\n{3}'.format( - language_str, - distance_header_array, - distance_data_array, - d_ij)) - - # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to - # orient it left-right. 
- - if len(distance_data_array) > 1: - - embedding_2d, strain_2d = ( - CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) - - embedding_2d_pca = ( - sklearn.decomposition.PCA(n_components = 2) - .fit_transform(embedding_2d)) - - distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) - - else: - - embedding_2d = numpy.zeros((1, 2)) - embedding_2d_pca = numpy.zeros((1, 2)) - - strain_2d = 0.0 - - distance_2d = numpy.zeros((1, 1)) - - # Showing what we computed. - - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 2d:\n{1}' - '\nembedding 2d (PCA-oriented):\n{2}' - '\nstrain 2d:\n{3}' - '\ndistances 2d:\n{4}'.format( - language_str, - embedding_2d, - embedding_2d_pca, - strain_2d, - distance_2d)) - - # And now the same with 3d embedding. - - if len(distance_data_array) > 1: - - embedding_3d, strain_3d = ( - CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) - - # At least three points, standard PCA-based orientation. - - if len(distance_data_array) >= 3: - - embedding_3d_pca = ( - sklearn.decomposition.PCA(n_components = 3) - .fit_transform(embedding_3d)) - - # Only two points, so we take 2d embedding and extend it with zeros. - - else: - - embedding_3d_pca = ( - - numpy.hstack(( - embedding_2d_pca, - numpy.zeros((embedding_2d_pca.shape[0], 1))))) - - # Making 3d embedding actually 3d, if required. - - if embedding_3d_pca.shape[1] <= 2: - - embedding_3d_pca = ( - - numpy.hstack(( - embedding_3d_pca, - numpy.zeros((embedding_3d_pca.shape[0], 1))))) - - distance_3d = ( - sklearn.metrics.euclidean_distances(embedding_3d_pca)) - - else: - - embedding_3d = numpy.zeros((1, 3)) - embedding_3d_pca = numpy.zeros((1, 3)) - - strain_3d = 0.0 - - distance_3d = numpy.zeros((1, 1)) - - # Showing what we've get. - - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 3d:\n{1}' - '\nembedding 3d (PCA-oriented):\n{2}' - '\nstrain 3d:\n{3}' - '\ndistances 3d:\n{4}'.format( - language_str, - embedding_3d, - embedding_3d_pca, - strain_3d, - distance_3d)) - - # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d - # embedding distances to break ties. - - if len(distance_data_array) <= 1: - mst_list = [] - - else: - - d_min, d_extra_min, min_i, min_j = min( - (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j) - for i in range(d_ij.shape[0] - 1) - for j in range(i + 1, d_ij.shape[0])) - - mst_list = [(min_i, min_j)] - mst_dict = {} - - # MST construction initialization. - - for i in range(d_ij.shape[0]): - - if i == min_i or i == min_j: - continue - - d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i]) - d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j]) - - mst_dict[i] = ( - (d_min_i, min_i) if d_min_i <= d_min_j else - (d_min_j, min_i)) - - # Iterative MST construction. - - while len(mst_dict) > 0: - - (d_min, d_extra_min, i_min, i_from_min) = min( - (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) - - log.debug('\n' + pprint.pformat(mst_dict)) - log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) - - mst_list.append((i_from_min, i_min)) - del mst_dict[i_min] - - # Updating shortest connection info. - - for i_to in mst_dict.keys(): - - d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) - - if d_to < mst_dict[i_to][0]: - mst_dict[i_to] = (d_to, i_min) - - log.debug( - '\ncognate_analysis {0}:' - '\nminimum spanning tree:\n{1}'.format( - language_str, - pprint.pformat(mst_list))) - - # Plotting with matplotlib. 
- - figure = pyplot.figure(figsize = (10, 10)) - axes = figure.add_subplot(212) - - axes.set_title( - 'Etymological distance tree (relative distance embedding)', - fontsize = 14, family = 'Gentium') - - axes.axis('equal') - axes.axis('off') - axes.autoscale() - - def f(axes, embedding_pca): - """ - Plots specified graph embedding on a given axis. - """ - - flag_3d = numpy.size(embedding_pca, 1) > 2 - - for index, (position, name) in enumerate( - zip(embedding_pca, distance_header_array)): - - # Checking if any of the previous perspectives are already in this perspective's - # position. - - same_position_index = None - - for i, p in enumerate(embedding_pca[:index]): - if numpy.linalg.norm(position - p) <= 1e-3: - - same_position_index = i - break - - color = matplotlib.colors.hsv_to_rgb( - [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) - - label_same_str = ( - '' if same_position_index is None else - ' (same as {0})'.format(same_position_index + 1)) - - kwargs = { - 's': 35, - 'color': color, - 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} - - axes.scatter(*position, **kwargs) - - # Annotating position with its number, but only if we hadn't already annotated nearby. - - if same_position_index is None: - - if flag_3d: - - axes.text( - position[0] + 0.01, position[1], position[2] + 0.01, - str(index + 1), None, fontsize = 14) - - else: - - axes.annotate( - str(index + 1), - (position[0] + 0.01, position[1] - 0.005), - fontsize = 14) - - # Plotting minimum spanning trees. - - line_list = [ - (embedding_pca[i], embedding_pca[j]) - for i, j in mst_list] - - line_collection = ( - Line3DCollection if flag_3d else LineCollection)( - line_list, zorder = 0, color = 'gray') - - axes.add_collection(line_collection) - - pyplot.setp(axes.texts, family = 'Gentium') - - # Plotting our embedding, creating the legend. - - f(axes, embedding_2d_pca) - - pyplot.tight_layout() - - legend = axes.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) - - pyplot.setp(legend.texts, family = 'Gentium') - axes.autoscale_view() - - # Saving generated figure for debug purposes, if required. - - if __debug_flag__: - - figure_file_name = ( - 'figure cognate distance{0}.png'.format( - mode_name_str)) - - with open(figure_file_name, 'wb') as figure_file: - - pyplot.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - # Also generating 3d embedding figure. - - figure_3d = pyplot.figure() - figure_3d.set_size_inches(16, 10) - - axes_3d = figure_3d.add_subplot(111, projection = '3d') - - axes_3d.axis('equal') - axes_3d.view_init(elev = 30, azim = -75) - - f(axes_3d, embedding_3d_pca) - - # Setting up legend. - - axes_3d.set_xlabel('X') - axes_3d.set_ylabel('Y') - axes_3d.set_zlabel('Z') - - legend_3d = axes_3d.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) - - pyplot.setp(legend_3d.texts, family = 'Gentium') - - # Fake cubic bounding box to force axis aspect ratios, see - # https://stackoverflow.com/a/13701747/2016856. 
- - X = embedding_3d_pca[:,0] - Y = embedding_3d_pca[:,1] - Z = embedding_3d_pca[:,2] - - max_range = numpy.array([ - X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() - - Xb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + - 0.5 * (X.max() + X.min())) - - Yb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + - 0.5 * (Y.max() + Y.min())) - - Zb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + - 0.5 * (Z.max() + Z.min())) - - for xb, yb, zb in zip(Xb, Yb, Zb): - axes_3d.plot([xb], [yb], [zb], 'w') - - axes_3d.autoscale_view() - - # And saving it. - - figure_3d_file_name = ( - 'figure 3d cognate distance{0}.png'.format( - mode_name_str)) - - with open(figure_3d_file_name, 'wb') as figure_3d_file: - - figure_3d.savefig( - figure_3d_file, - bbox_extra_artists = (legend_3d,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - # Storing generated figure as a PNG image. - - figure_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) - - figure_path = os.path.join(storage_dir, figure_filename) - os.makedirs(os.path.dirname(figure_path), exist_ok = True) - - with open(figure_path, 'wb') as figure_file: - - figure.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - figure_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', figure_filename]) + figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ + distance_graph( + language_str, + distance_data_array, + distance_header_array, + __debug_flag__ + ) # Finalizing task status, if required, returning result. 
- if task_status is not None: result_link_list = ( From b02f0da4c6867b471fe5461623cdfbb689e1b8e2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 20:59:11 +0300 Subject: [PATCH 20/69] Switch back Cognate analysis --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 9a77feaa..af974fd2 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12841,7 +12841,7 @@ def mutate(self, info, **args): cognate_suggestions_f if mode == 'suggestions' else cognate_analysis_f) - if analysis_f is None and False: + if analysis_f is None: return ResponseError(message = 'Analysis library fuction \'{0}()\' is absent, ' From 614aee6fad8374e551162d0d13a108b68dc29c06 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 16:15:49 +0300 Subject: [PATCH 21/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index af974fd2..3ebbe069 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12596,7 +12596,7 @@ def perform_cognate_analysis( if figure_flag: figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ - distance_graph( + CognateAnalysis.distance_graph( language_str, distance_data_array, distance_header_array, From 2a9e4b4a7b15c5af1acf312fc1240da2730f679f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 18:30:59 +0300 Subject: [PATCH 22/69] Fix --- lingvodoc/schema/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3ebbe069..2b4e4ec0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10890,6 +10890,7 @@ def f_callback(xyz): @staticmethod def distance_graph( language_str, + base_language_name, distance_data_array, distance_header_array, __debug_flag__): @@ -12598,6 +12599,7 @@ def perform_cognate_analysis( figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, + base_language_name, distance_data_array, distance_header_array, __debug_flag__ @@ -13046,13 +13048,14 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): + if ' заим.' in lex: + return set() # Split by commas and open brackets to separate - # various forms of lexem and extra note if is + # various forms of lexeme and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') if form.strip() - and ')' not in form - and ' заим.' 
not in form) #exclude notes and borrowings + and ')' not in form) # exclude notes # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) From 8208eef7fd0507b8fae75edacf23e83b1a24eb39 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 19:36:39 +0300 Subject: [PATCH 23/69] distance_data_array --- lingvodoc/schema/query.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 2b4e4ec0..f3c22b56 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10893,7 +10893,7 @@ def distance_graph( base_language_name, distance_data_array, distance_header_array, - __debug_flag__): + __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -13031,6 +13031,7 @@ class Arguments: @staticmethod def swadesh_statistics( language_str, + base_language_name, group_field_id, perspective_info_list, locale_id): @@ -13068,10 +13069,12 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} + distance_array_size = len(perspective_info_list) + distance_data_array = numpy.full((distance_array_size, distance_array_size), 100) + distance_header_array = numpy.empty(distance_array_size) for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): - ''' # Getting and saving perspective info. perspective = ( DBSession @@ -13081,18 +13084,7 @@ def split_lex(lex): ) perspective_name = perspective.get_translation(locale_id) - dictionary_name = perspective.parent.get_translation(locale_id) - - log.debug( - '\nswadesh_analysis {0}:' - '\n dictionary {1}/{2}: {3}' - '\n perspective {4}/{5}: {6}'.format( - language_str, - perspective.parent_client_id, perspective.parent_object_id, - repr(dictionary_name.strip()), - perspective_id[0], perspective_id[1], - repr(perspective_name.strip()))) - ''' + distance_header_array[index] = perspective_name # Getting text data. 
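
# A rough plain-Python sketch (hypothetical rows) of what the query below
# computes: for every lexical entry of the perspective, aggregate the contents
# of its published and accepted translation entities, i.e. the
# array_agg(...) ... GROUP BY lexical entry part of the SQLAlchemy query.

import itertools

entity_rows = [
    # (lexical entry id, content, published, accepted)
    ((101, 1), 'рыба', True, True),
    ((101, 1), 'fish', True, True),
    ((102, 1), 'птица', True, False),   # not accepted, so it is filtered out
]

accepted_rows = sorted(
    (row for row in entity_rows if row[2] and row[3]),
    key = lambda row: row[0])

translation_dict = {
    entry_id: [content for _, content, _, _ in rows]
    for entry_id, rows in itertools.groupby(
        accepted_rows, key = lambda row: row[0])}

print(translation_dict)   # {(101, 1): ['рыба', 'fish']}
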
translation_query = ( @@ -13157,15 +13149,23 @@ def split_lex(lex): similarity[perspective1] = {} print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): - if n2 <= n1: continue #exclude duplicates and self-to-self - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + #if n2 <= n1: continue #exclude duplicates and self-to-self commons_linked = len(groups1 & groups2) + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 - similarity[perspective1][perspective2] = commons_linked, commons_total + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + distance_data_array[n1][n2] = distance + #similarity[perspective1][perspective2] = commons_linked, commons_total print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array + ) + @staticmethod def mutate(self, info, **args): """ @@ -13266,6 +13266,7 @@ def mutate(self, info, **args): return SwadeshAnalysis.swadesh_statistics( language_str, + base_language_name, group_field_id, perspective_info_list, locale_id) From f57492e1f1767e5c5deab7f44ee6f1897f74c44f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 23:31:52 +0300 Subject: [PATCH 24/69] Result dict --- lingvodoc/schema/query.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f3c22b56..632b9b21 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10893,6 +10893,7 @@ def distance_graph( base_language_name, distance_data_array, distance_header_array, + mode, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12602,6 +12603,7 @@ def perform_cognate_analysis( base_language_name, distance_data_array, distance_header_array, + mode, __debug_flag__ ) @@ -13028,6 +13030,10 @@ class Arguments: triumph = graphene.Boolean() + minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) + embedding_2d = graphene.List(graphene.List(graphene.Float)) + embedding_3d = graphene.List(graphene.List(graphene.Float)) + @staticmethod def swadesh_statistics( language_str, @@ -13163,9 +13169,28 @@ def split_lex(lex): language_str, base_language_name, distance_data_array, - distance_header_array + distance_header_array, + "swadesh" ) + result_dict = ( + + dict( + triumph = True, + + minimum_spanning_tree = mst_list, + embedding_2d = embedding_2d_pca, + embedding_3d = embedding_3d_pca)) + + if __debug_flag__ and __result_flag__: + + with gzip.open( + result_file_name, 'wb') as result_file: + + pickle.dump(result_dict, result_file) + + return SwadeshAnalysis(**result_dict) + @staticmethod def mutate(self, info, **args): """ From 2a1d07214002803c136ec1c8f56a1a347f99b128 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 21 May 2023 23:13:30 +0300 Subject: [PATCH 25/69] Tuned distance_graph --- lingvodoc/schema/query.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 632b9b21..03a6ba43 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10894,6 +10894,9 @@ def distance_graph( 
distance_data_array, distance_header_array, mode, + storage, + storage_dir, + figure_filename, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12604,6 +12607,9 @@ def perform_cognate_analysis( distance_data_array, distance_header_array, mode, + storage, + storage_dir, + figure_filename, __debug_flag__ ) @@ -13165,13 +13171,17 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() - CognateAnalysis.distance_graph( - language_str, - base_language_name, - distance_data_array, - distance_header_array, - "swadesh" - ) + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array, + "swadesh", + storage, + storage_dir, + figure_filename + ) result_dict = ( From 76ab301e5ad241675c096835a335430f1378ce4e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 21 May 2023 23:34:07 +0300 Subject: [PATCH 26/69] Cleanup --- lingvodoc/schema/query.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 03a6ba43..d940aa36 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12957,31 +12957,6 @@ def mutate(self, info, **args): return CognateAnalysis(triumph = True) - elif mode == 'swadesh': - - return CognateAnalysis.swadesh_statistics( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - # We do not use acoustic data, so we perform cognate analysis synchronously. 
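
# For illustration, the pairwise distance that swadesh_statistics() (in the
# patches above) assigns to a pair of dictionaries: the share of common
# Swadesh items that are etymologically linked goes through a negative
# logarithm scaled by the 0.14 constant, identical lists give (minus) zero,
# and the `else 100` branch acts as a "no measurable relation" sentinel.
# The counts below are made up.

import math

def swadesh_distance(commons_linked, commons_total):
    return (
        math.log(commons_linked / commons_total) / -0.14
        if commons_linked > 0 else 100)

print(round(swadesh_distance(100, 100), 2))   # -0.0, i.e. zero
print(round(swadesh_distance(70, 100), 2))    # 2.55
print(round(swadesh_distance(50, 100), 2))    # 4.95
print(swadesh_distance(0, 100))               # 100
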
else: From ba1b8bd7a4cd91b0a984509dadb5fb6b636bb047 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 00:07:37 +0300 Subject: [PATCH 27/69] Storage arg --- lingvodoc/schema/query.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d940aa36..93cce66c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,7 +10896,6 @@ def distance_graph( mode, storage, storage_dir, - figure_filename, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12609,7 +12608,6 @@ def perform_cognate_analysis( mode, storage, storage_dir, - figure_filename, __debug_flag__ ) @@ -13021,7 +13019,8 @@ def swadesh_statistics( base_language_name, group_field_id, perspective_info_list, - locale_id): + locale_id, + storage): swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', @@ -13146,6 +13145,9 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() + cur_time = time.time() + storage_dir = os.path.join(storage['path'], 'swadesh', str(cur_time)) + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, @@ -13154,8 +13156,7 @@ def split_lex(lex): distance_header_array, "swadesh", storage, - storage_dir, - figure_filename + storage_dir ) result_dict = ( @@ -13279,7 +13280,8 @@ def mutate(self, info, **args): base_language_name, group_field_id, perspective_info_list, - locale_id) + locale_id, + storage) # Exception occured while we tried to perform swadesh analysis. except Exception as exception: From 4f7938a61e88e774585ab17ca651aa17d4bab503 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 00:50:36 +0300 Subject: [PATCH 28/69] current_datetime --- lingvodoc/schema/query.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 93cce66c..8c2ee4bb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -11252,7 +11252,7 @@ def f(axes, embedding_pca): format = 'png') # Storing generated figure as a PNG image. 
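
# A worked example (hypothetical language name and date) of the file name the
# code below builds; pathvalidate.sanitize_filename() then only removes
# characters that would be invalid in a file name.

import datetime
import pathvalidate

current_datetime = datetime.datetime(2023, 5, 22, tzinfo = datetime.timezone.utc)
base_language_name = 'Uralic'
mode = 'swadesh'

figure_filename = pathvalidate.sanitize_filename(
    '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format(
        base_language_name[:64],
        ' ' + mode if mode else '',
        current_datetime.year,
        current_datetime.month,
        current_datetime.day))

print(figure_filename)   # Uralic cognate swadesh analysis 2023.05.22.png
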
- + current_datetime = datetime.datetime.now(datetime.timezone.utc) figure_filename = pathvalidate.sanitize_filename( '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( base_language_name[:64], @@ -13055,9 +13055,9 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - distance_array_size = len(perspective_info_list) - distance_data_array = numpy.full((distance_array_size, distance_array_size), 100) - distance_header_array = numpy.empty(distance_array_size) + dictionary_count = len(perspective_info_list) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_header_array = numpy.empty(dictionary_count) for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): From 8a36aaf28980cb12c1979efd96d496420d7aec15 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 01:22:27 +0300 Subject: [PATCH 29/69] cur_time --- lingvodoc/schema/query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c2ee4bb..4357a559 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -11273,6 +11273,7 @@ def f(axes, embedding_pca): pad_inches = 0.25, format = 'png') + cur_time = time.time() figure_url = ''.join([ storage['prefix'], storage['static_route'], 'cognate', '/', str(cur_time), '/', figure_filename]) From eeea3d8ffd2be15d14198ec114d3ecd66fde4b90 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 18:43:04 +0300 Subject: [PATCH 30/69] Used handleResult --- lingvodoc/schema/query.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4357a559..fde8b5cc 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13058,7 +13058,7 @@ def split_lex(lex): swadesh_set = {} dictionary_count = len(perspective_info_list) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count) + distance_header_array = numpy.empty(dictionary_count, dtype='object') for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -13159,7 +13159,6 @@ def split_lex(lex): storage, storage_dir ) - result_dict = ( dict( @@ -13169,13 +13168,6 @@ def split_lex(lex): embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca)) - if __debug_flag__ and __result_flag__: - - with gzip.open( - result_file_name, 'wb') as result_file: - - pickle.dump(result_dict, result_file) - return SwadeshAnalysis(**result_dict) @staticmethod From 56a66e5ad4cb6a15710059040f3a91d1f5f4c75d Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 20:18:39 +0300 Subject: [PATCH 31/69] perspective_name_list --- lingvodoc/schema/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index fde8b5cc..f09dc798 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13013,6 +13013,7 @@ class Arguments: minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) + perspective_name_list = graphene.List(graphene.String) @staticmethod def swadesh_statistics( @@ -13166,7 +13167,8 @@ def split_lex(lex): minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, - embedding_3d = 
embedding_3d_pca)) + embedding_3d = embedding_3d_pca, + perspective_name_list = distance_header_array)) return SwadeshAnalysis(**result_dict) From 3a42ee8d582df9f67b49e656f57496323e6d117f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 23 May 2023 17:57:05 +0300 Subject: [PATCH 32/69] First graph --- lingvodoc/schema/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f09dc798..f03423c5 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13071,8 +13071,8 @@ def split_lex(lex): .first() ) - perspective_name = perspective.get_translation(locale_id) - distance_header_array[index] = perspective_name + dictionary_name = perspective.parent.get_translation(locale_id) + distance_header_array[index] = dictionary_name # Getting text data. translation_query = ( From 5b2925c1318c1cada2861847eaea5ecaafad3aae Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 12:28:35 +0300 Subject: [PATCH 33/69] __plot_flag__ --- lingvodoc/schema/query.py | 309 +++++++++++++++++++------------------- 1 file changed, 157 insertions(+), 152 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f03423c5..51f141eb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,6 +10896,7 @@ def distance_graph( mode, storage, storage_dir, + __plot_flag__ = True, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -11067,216 +11068,219 @@ def distance_graph( pprint.pformat(mst_list))) # Plotting with matplotlib. + figure_url = None + if __plot_flag__: - figure = pyplot.figure(figsize = (10, 10)) - axes = figure.add_subplot(212) + figure = pyplot.figure(figsize = (10, 10)) + axes = figure.add_subplot(212) - axes.set_title( - 'Etymological distance tree (relative distance embedding)', - fontsize = 14, family = 'Gentium') + axes.set_title( + 'Etymological distance tree (relative distance embedding)', + fontsize = 14, family = 'Gentium') - axes.axis('equal') - axes.axis('off') - axes.autoscale() + axes.axis('equal') + axes.axis('off') + axes.autoscale() - def f(axes, embedding_pca): - """ - Plots specified graph embedding on a given axis. - """ + def f(axes, embedding_pca): + """ + Plots specified graph embedding on a given axis. + """ - flag_3d = numpy.size(embedding_pca, 1) > 2 + flag_3d = numpy.size(embedding_pca, 1) > 2 - for index, (position, name) in enumerate( - zip(embedding_pca, distance_header_array)): + for index, (position, name) in enumerate( + zip(embedding_pca, distance_header_array)): - # Checking if any of the previous perspectives are already in this perspective's - # position. + # Checking if any of the previous perspectives are already in this perspective's + # position. 
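
# A small aside on the colour selection a few lines below: because
# `same_position_index or index` falls back to `index` whenever
# same_position_index equals 0, a point that coincides with the very first
# perspective gets its own hue instead of reusing hue 0; an explicit None
# check sidesteps this falsy-zero pitfall.

same_position_index = 0   # hypothetical: coincides with perspective 0
index = 3

print(same_position_index or index)                                        # 3
print(same_position_index if same_position_index is not None else index)   # 0
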
- same_position_index = None + same_position_index = None - for i, p in enumerate(embedding_pca[:index]): - if numpy.linalg.norm(position - p) <= 1e-3: + for i, p in enumerate(embedding_pca[:index]): + if numpy.linalg.norm(position - p) <= 1e-3: - same_position_index = i - break + same_position_index = i + break - color = matplotlib.colors.hsv_to_rgb( - [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) + color = matplotlib.colors.hsv_to_rgb( + [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) - label_same_str = ( - '' if same_position_index is None else - ' (same as {0})'.format(same_position_index + 1)) + label_same_str = ( + '' if same_position_index is None else + ' (same as {0})'.format(same_position_index + 1)) - kwargs = { - 's': 35, - 'color': color, - 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} + kwargs = { + 's': 35, + 'color': color, + 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} - axes.scatter(*position, **kwargs) + axes.scatter(*position, **kwargs) - # Annotating position with its number, but only if we hadn't already annotated nearby. + # Annotating position with its number, but only if we hadn't already annotated nearby. - if same_position_index is None: + if same_position_index is None: - if flag_3d: + if flag_3d: - axes.text( - position[0] + 0.01, position[1], position[2] + 0.01, - str(index + 1), None, fontsize = 14) + axes.text( + position[0] + 0.01, position[1], position[2] + 0.01, + str(index + 1), None, fontsize = 14) - else: + else: - axes.annotate( - str(index + 1), - (position[0] + 0.01, position[1] - 0.005), - fontsize = 14) + axes.annotate( + str(index + 1), + (position[0] + 0.01, position[1] - 0.005), + fontsize = 14) - # Plotting minimum spanning trees. + # Plotting minimum spanning trees. - line_list = [ - (embedding_pca[i], embedding_pca[j]) - for i, j in mst_list] + line_list = [ + (embedding_pca[i], embedding_pca[j]) + for i, j in mst_list] - line_collection = ( - Line3DCollection if flag_3d else LineCollection)( - line_list, zorder = 0, color = 'gray') + line_collection = ( + Line3DCollection if flag_3d else LineCollection)( + line_list, zorder = 0, color = 'gray') - axes.add_collection(line_collection) + axes.add_collection(line_collection) - pyplot.setp(axes.texts, family = 'Gentium') + pyplot.setp(axes.texts, family = 'Gentium') - # Plotting our embedding, creating the legend. + # Plotting our embedding, creating the legend. - f(axes, embedding_2d_pca) + f(axes, embedding_2d_pca) - pyplot.tight_layout() + pyplot.tight_layout() - legend = axes.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + legend = axes.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - pyplot.setp(legend.texts, family = 'Gentium') - axes.autoscale_view() + pyplot.setp(legend.texts, family = 'Gentium') + axes.autoscale_view() - # Saving generated figure for debug purposes, if required. + # Saving generated figure for debug purposes, if required. 
- if __debug_flag__: + if __debug_flag__: - figure_file_name = ( - 'figure cognate distance{0}.png'.format( - mode_name_str)) + figure_file_name = ( + 'figure cognate distance{0}.png'.format( + mode_name_str)) - with open(figure_file_name, 'wb') as figure_file: + with open(figure_file_name, 'wb') as figure_file: - pyplot.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') + pyplot.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') - # Also generating 3d embedding figure. + # Also generating 3d embedding figure. - figure_3d = pyplot.figure() - figure_3d.set_size_inches(16, 10) + figure_3d = pyplot.figure() + figure_3d.set_size_inches(16, 10) - axes_3d = figure_3d.add_subplot(111, projection = '3d') + axes_3d = figure_3d.add_subplot(111, projection = '3d') - axes_3d.axis('equal') - axes_3d.view_init(elev = 30, azim = -75) + axes_3d.axis('equal') + axes_3d.view_init(elev = 30, azim = -75) - f(axes_3d, embedding_3d_pca) + f(axes_3d, embedding_3d_pca) - # Setting up legend. + # Setting up legend. - axes_3d.set_xlabel('X') - axes_3d.set_ylabel('Y') - axes_3d.set_zlabel('Z') + axes_3d.set_xlabel('X') + axes_3d.set_ylabel('Y') + axes_3d.set_zlabel('Z') - legend_3d = axes_3d.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + legend_3d = axes_3d.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - pyplot.setp(legend_3d.texts, family = 'Gentium') + pyplot.setp(legend_3d.texts, family = 'Gentium') - # Fake cubic bounding box to force axis aspect ratios, see - # https://stackoverflow.com/a/13701747/2016856. + # Fake cubic bounding box to force axis aspect ratios, see + # https://stackoverflow.com/a/13701747/2016856. - X = embedding_3d_pca[:,0] - Y = embedding_3d_pca[:,1] - Z = embedding_3d_pca[:,2] + X = embedding_3d_pca[:,0] + Y = embedding_3d_pca[:,1] + Z = embedding_3d_pca[:,2] - max_range = numpy.array([ - X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() + max_range = numpy.array([ + X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() - Xb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + - 0.5 * (X.max() + X.min())) + Xb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + + 0.5 * (X.max() + X.min())) - Yb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + - 0.5 * (Y.max() + Y.min())) + Yb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + + 0.5 * (Y.max() + Y.min())) - Zb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + - 0.5 * (Z.max() + Z.min())) + Zb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + + 0.5 * (Z.max() + Z.min())) - for xb, yb, zb in zip(Xb, Yb, Zb): - axes_3d.plot([xb], [yb], [zb], 'w') + for xb, yb, zb in zip(Xb, Yb, Zb): + axes_3d.plot([xb], [yb], [zb], 'w') - axes_3d.autoscale_view() + axes_3d.autoscale_view() - # And saving it. + # And saving it. 
- figure_3d_file_name = ( - 'figure 3d cognate distance{0}.png'.format( - mode_name_str)) + figure_3d_file_name = ( + 'figure 3d cognate distance{0}.png'.format( + mode_name_str)) - with open(figure_3d_file_name, 'wb') as figure_3d_file: + with open(figure_3d_file_name, 'wb') as figure_3d_file: - figure_3d.savefig( - figure_3d_file, - bbox_extra_artists = (legend_3d,), + figure_3d.savefig( + figure_3d_file, + bbox_extra_artists = (legend_3d,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Storing generated figure as a PNG image. + current_datetime = datetime.datetime.now(datetime.timezone.utc) + figure_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + figure_path = os.path.join(storage_dir, figure_filename) + os.makedirs(os.path.dirname(figure_path), exist_ok = True) + + with open(figure_path, 'wb') as figure_file: + + figure.savefig( + figure_file, + bbox_extra_artists = (legend,), bbox_inches = 'tight', pad_inches = 0.25, format = 'png') - # Storing generated figure as a PNG image. - current_datetime = datetime.datetime.now(datetime.timezone.utc) - figure_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) - - figure_path = os.path.join(storage_dir, figure_filename) - os.makedirs(os.path.dirname(figure_path), exist_ok = True) - - with open(figure_path, 'wb') as figure_file: - - figure.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - cur_time = time.time() - figure_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', figure_filename]) + cur_time = time.time() + figure_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', figure_filename]) + ### Plotting with matplotlib ends return ( figure_url, @@ -13158,7 +13162,8 @@ def split_lex(lex): distance_header_array, "swadesh", storage, - storage_dir + storage_dir, + __plot_flag__ = False ) result_dict = ( @@ -13191,7 +13196,7 @@ def mutate(self, info, **args): # Administrator / perspective author / editing permission check. 
error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform swadesh analysis.') + 'can perform Swadesh analysis.') client_id = info.context.request.authenticated_userid From f9633c06c18c51405b1720dbd2265ac376378e29 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 23:30:56 +0300 Subject: [PATCH 34/69] Gathered result_pool, fixed a vulnerability --- lingvodoc/schema/query.py | 73 +++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 51f141eb..c47a4e06 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13061,10 +13061,8 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - dictionary_count = len(perspective_info_list) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count, dtype='object') - for index, (perspective_id, _, translation_field_id) in \ + result_pool = {} + for index, (perspective_id, word_field_id, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -13074,11 +13072,32 @@ def split_lex(lex): .filter_by(client_id=perspective_id[0], object_id=perspective_id[1]) .first() ) - dictionary_name = perspective.parent.get_translation(locale_id) - distance_header_array[index] = dictionary_name # Getting text data. + word_query = ( + DBSession + .query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id) + .filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == word_field_id[0], + dbEntity.field_object_id == word_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) + .add_columns( + func.array_agg(dbEntity.content).label('word')) + .group_by(dbLexicalEntry) + .subquery()) + translation_query = ( DBSession .query( @@ -13100,14 +13119,30 @@ def split_lex(lex): .add_columns( func.array_agg(dbEntity.content).label('translation')) .group_by(dbLexicalEntry) + .subquery()) + + # Main query for word/translation data. + data_query = ( + DBSession + .query(word_query) + .outerjoin(translation_query, and_( + word_query.c.client_id == translation_query.c.client_id, + word_query.c.object_id == translation_query.c.object_id)) + .add_columns( + translation_query.c.translation) .all()) # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() - for row_index, row in enumerate(translation_query): + result_pool[perspective_id] = {'name': dictionary_name} + for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) - translation_list = row[2] + word_list, translation_list = row[2:4] + + # If we have no words for this lexical entry, we skip it altogether. 
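
# For illustration (all ids and words are hypothetical), the shape this loop
# gives result_pool and that the later table-building code consumes: each
# perspective id maps to a 'name' entry plus one human-readable record per
# matched lexical entry; 'group' stays None until the etymological groups are
# applied below.

result_pool_example = {
    (201, 7): {
        'name': 'Hypothetical dictionary',
        (1234, 5678): {
            'group': None,
            'swadesh': 'рыба',
            'word': 'kala',
            'translation': 'рыба',
        },
    },
}
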
+ if not word_list: + continue translation_list = ( [] if not translation_list else [ @@ -13121,24 +13156,35 @@ def split_lex(lex): # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) swadesh_set[perspective_id].add(swadesh_num) - #print(entry_id, swadesh_num, translation_lex) + # Store the entry content in human readable format + result_pool[perspective_id][entry_id] = { + 'group': None, + 'swadesh': swadesh_lex, + 'word': word_list[0], + 'translation': translation_lex + } # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met - links = {} + links = collections.OrderedDict() for perspective, entries in entries_set.items(): links[perspective] = set() for group_index, group in enumerate(group_list): - if (entries & group): + linked = entries & group + if linked: links[perspective].add(group_index) + result_pool[perspective][linked.pop()]['group'] = group_index + + dictionary_count = len(links) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_header_array = numpy.empty(dictionary_count, dtype='object') # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives # commons_total means amount of Swadesh's lexems met in the both perspectives - similarity = {} for n1, (perspective1, groups1) in enumerate(links.items()): - similarity[perspective1] = {} + distance_header_array[n1] = result_pool[perspective1]['name'] print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): #if n2 <= n1: continue #exclude duplicates and self-to-self @@ -13147,7 +13193,6 @@ def split_lex(lex): # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - #similarity[perspective1][perspective2] = commons_linked, commons_total print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() From ee005c9c82097a6fb899ccb40569e83c56a1cdfb Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 23:59:54 +0300 Subject: [PATCH 35/69] Cleanup --- lingvodoc/schema/query.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c47a4e06..4538a5e9 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,6 +13041,7 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): + #TODO: move this condition if ' заим.' 
in lex: return set() # Split by commas and open brackets to separate @@ -13156,7 +13157,7 @@ def split_lex(lex): # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) swadesh_set[perspective_id].add(swadesh_num) - # Store the entry content in human readable format + # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, 'swadesh': swadesh_lex, @@ -13178,7 +13179,7 @@ def split_lex(lex): dictionary_count = len(links) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count, dtype='object') + distance_header_array = numpy.full(dictionary_count, "", dtype='object') # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives @@ -13196,18 +13197,15 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() - cur_time = time.time() - storage_dir = os.path.join(storage['path'], 'swadesh', str(cur_time)) - _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, base_language_name, distance_data_array, distance_header_array, - "swadesh", - storage, - storage_dir, + None, + None, + None, __plot_flag__ = False ) result_dict = ( From 568eb3e280fab0c2ce52156d4e495f2612afc097 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 11:01:28 +0300 Subject: [PATCH 36/69] Args right order --- lingvodoc/schema/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4538a5e9..855ea0de 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,8 +10896,8 @@ def distance_graph( mode, storage, storage_dir, - __plot_flag__ = True, - __debug_flag__ = False): + __debug_flag__ = False, + __plot_flag__ = True): d_ij = (distance_data_array + distance_data_array.T) / 2 From 90324a1e4c976621679dbadd25d9c26e26afb7df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 22:43:03 +0300 Subject: [PATCH 37/69] create_table --- lingvodoc/schema/query.py | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 855ea0de..c2758fc3 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13019,6 +13019,64 @@ class Arguments: embedding_3d = graphene.List(graphene.List(graphene.Float)) perspective_name_list = graphene.List(graphene.String) + @staticmethod + def create_table(result_pool, group_count): + ''' + Keys: + result_pool[perspective_id][entry_id] + Fields: + 'group': group_index, + 'swadesh': swadesh_lex, + 'word': word_list[0], + 'translation': translation_lex + ''' + + space = ' ' + col_len = 62 + def combine(*args): + result = space * 2 + fld_len = ((col_len - 2) // len(args)) - 2 + + for s in args: + result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" + return result + + dict_count = len(result_pool) + #print(f"{dict_count}:{result_pool}) + + # 'groups' is horizontals in table before 'single' + groups = [[None] * dict_count] * group_count + + # 'single' is verticals in table after 'groups' + # first element in every vertical is the dictionary name + single = [[]] * dict_count + + # re-group by group number and add joined values + for dict_index, perspective in enumerate(result_pool.values()): + dict_name = 
combine(f"{dict_index + 1}. {perspective['name']}") + single[dict_index].append(dict_name) + + for entry in perspective.values(): + print(entry) + group_num = entry['group'] + entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) + if group_num: + groups[group_num][dict_index] = entry_text + else: + single[dict_index].append(entry_text) + + # iterate through 'groups' and 'single' and concatenate result + result = "" + # headers + result += ''.join(single[:][0]) + '\n\n' + # groups by lines + result += '\n'.join(''.join(line) for line in groups) + # not-cognates by columns + for indent, entries in enumerate(single): + result += '\n'.join(space * col_len * indent + entry for entry in entries) + + return result + @staticmethod def swadesh_statistics( language_str, @@ -13208,11 +13266,16 @@ def split_lex(lex): None, __plot_flag__ = False ) + + result = SwadeshAnalysis.create_table(result_pool, len(group_list)) + print(result) + result_dict = ( dict( triumph = True, + #result = result, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From cd8f5220299e581f9558df8c9530390c49d7a686 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 22:54:11 +0300 Subject: [PATCH 38/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c2758fc3..a58c1a8c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13057,7 +13057,7 @@ def combine(*args): single[dict_index].append(dict_name) for entry in perspective.values(): - print(entry) + if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) if group_num: From 08e5bd8b6086779b22dcf85aaac9f94369b674df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 23:07:54 +0300 Subject: [PATCH 39/69] Fix --- lingvodoc/schema/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a58c1a8c..3dfc7ba3 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13042,10 +13042,9 @@ def combine(*args): return result dict_count = len(result_pool) - #print(f"{dict_count}:{result_pool}) # 'groups' is horizontals in table before 'single' - groups = [[None] * dict_count] * group_count + groups = [[""] * dict_count] * group_count # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name @@ -13059,7 +13058,8 @@ def combine(*args): for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) + entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) + print(entry_text) if group_num: groups[group_num][dict_index] = entry_text else: From 74e40a4753ce696ba16b2d100b5a073e722dea56 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 00:03:10 +0300 Subject: [PATCH 40/69] Some fixes --- lingvodoc/schema/query.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3dfc7ba3..d2dc72a7 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13014,6 +13014,7 @@ class Arguments: triumph = graphene.Boolean() + result = graphene.String() minimum_spanning_tree = 
graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) @@ -13033,6 +13034,7 @@ def create_table(result_pool, group_count): space = ' ' col_len = 62 + # get length-fixed lines def combine(*args): result = space * 2 fld_len = ((col_len - 2) // len(args)) - 2 @@ -13044,36 +13046,33 @@ def combine(*args): dict_count = len(result_pool) # 'groups' is horizontals in table before 'single' - groups = [[""] * dict_count] * group_count + groups = numpy.full((group_count, dict_count), space*col_len, dtype='object') # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name - single = [[]] * dict_count + single = [None] * dict_count # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = combine(f"{dict_index + 1}. {perspective['name']}") - single[dict_index].append(dict_name) - + single[dict_index] = [dict_name] for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) - print(entry_text) if group_num: groups[group_num][dict_index] = entry_text else: single[dict_index].append(entry_text) - # iterate through 'groups' and 'single' and concatenate result result = "" # headers - result += ''.join(single[:][0]) + '\n\n' + result += ''.join(single[n][0] for n in range(dict_count)) + '\n\n' # groups by lines - result += '\n'.join(''.join(line) for line in groups) + result += '\n'.join(''.join(line) for line in groups) + '\n' # not-cognates by columns for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry for entry in entries) + result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) return result @@ -13244,7 +13243,7 @@ def split_lex(lex): # commons_total means amount of Swadesh's lexems met in the both perspectives for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - print(perspective1, end=' :: ') + #print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): #if n2 <= n1: continue #exclude duplicates and self-to-self commons_linked = len(groups1 & groups2) @@ -13252,8 +13251,8 @@ def split_lex(lex): # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') - print() + #print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') + #print() _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13268,14 +13267,13 @@ def split_lex(lex): ) result = SwadeshAnalysis.create_table(result_pool, len(group_list)) - print(result) result_dict = ( dict( triumph = True, - #result = result, + result = result, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From 4b915daa9ac8121cfc2f39bdd0c892adcffa7592 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 10:40:39 +0300 Subject: [PATCH 41/69] Some fixes --- lingvodoc/schema/query.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 
d2dc72a7..2b050353 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,7 +13033,7 @@ def create_table(result_pool, group_count): ''' space = ' ' - col_len = 62 + col_len = 50 # get length-fixed lines def combine(*args): result = space * 2 @@ -13046,7 +13046,7 @@ def combine(*args): dict_count = len(result_pool) # 'groups' is horizontals in table before 'single' - groups = numpy.full((group_count, dict_count), space*col_len, dtype='object') + groups = numpy.full((group_count, dict_count), '.'*col_len, dtype='object') # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name @@ -13072,7 +13072,8 @@ def combine(*args): result += '\n'.join(''.join(line) for line in groups) + '\n' # not-cognates by columns for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) + result += '\n'.join(space * col_len * indent + entry + for entry in entries[1:]) + '\n' return result From 9a5f04c49d345abefddb1ce24c3171d73fea465e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 16:00:26 +0300 Subject: [PATCH 42/69] Used pandas dataframe --- lingvodoc/schema/query.py | 26 ++++++++++++-------------- server-requirements-final.txt | 1 + 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 2b050353..187c7008 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13032,8 +13032,11 @@ def create_table(result_pool, group_count): 'translation': translation_lex ''' + import pandas as pd + space = ' ' col_len = 50 + # get length-fixed lines def combine(*args): result = space * 2 @@ -13041,29 +13044,23 @@ def combine(*args): for s in args: result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" - return result - - dict_count = len(result_pool) - - # 'groups' is horizontals in table before 'single' - groups = numpy.full((group_count, dict_count), '.'*col_len, dtype='object') - # 'single' is verticals in table after 'groups' - # first element in every vertical is the dictionary name - single = [None] * dict_count + return result + groups = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = combine(f"{dict_index + 1}. 
{perspective['name']}") - single[dict_index] = [dict_name] for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) if group_num: - groups[group_num][dict_index] = entry_text + groups.loc[group_num, dict_name] = entry_text else: - single[dict_index].append(entry_text) + groups.loc[group_count, dict_name] = entry_text + group_count += 1 + ''' # iterate through 'groups' and 'single' and concatenate result result = "" # headers @@ -13074,8 +13071,9 @@ def combine(*args): for indent, entries in enumerate(single): result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) + '\n' + ''' - return result + return groups.to_html(index=False) @staticmethod def swadesh_statistics( @@ -13120,7 +13118,7 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - result_pool = {} + result_pool = collections.OrderedDict() for index, (perspective_id, word_field_id, translation_field_id) in \ enumerate(perspective_info_list): diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 460af8a7..557b0d02 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1 +1,2 @@ matplotlib==1.5.3 +pandas==2.0.1 From 142900251f03f5c09f17d809235b85bf840cd33a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 00:46:37 +0300 Subject: [PATCH 43/69] Pretty table --- lingvodoc/schema/query.py | 9 ++++++--- server-requirements-final.txt | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 187c7008..d12351a9 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,7 +13033,9 @@ def create_table(result_pool, group_count): ''' import pandas as pd + from pretty_html_table import build_table + ''' space = ' ' col_len = 50 @@ -13046,15 +13048,16 @@ def combine(*args): result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" return result + ''' groups = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): - dict_name = combine(f"{dict_index + 1}. {perspective['name']}") + dict_name = f"{dict_index + 1}. 
{perspective['name']}" for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) + entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" if group_num: groups.loc[group_num, dict_name] = entry_text else: @@ -13073,7 +13076,7 @@ def combine(*args): for entry in entries[1:]) + '\n' ''' - return groups.to_html(index=False) + return build_table(groups, 'blue_light', width="300px") @staticmethod def swadesh_statistics( diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 557b0d02..5497c99d 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,2 +1,3 @@ matplotlib==1.5.3 pandas==2.0.1 +pretty_html_table From 778118ba3878f1eed98f24ccda2950ac538f5f44 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 12:14:31 +0300 Subject: [PATCH 44/69] Show borrowed words --- lingvodoc/schema/query.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d12351a9..e5da7c78 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13055,7 +13055,8 @@ def combine(*args): for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): - if not isinstance(entry, dict): continue + if not isinstance(entry, dict): + continue group_num = entry['group'] entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" if group_num: @@ -13076,7 +13077,7 @@ def combine(*args): for entry in entries[1:]) + '\n' ''' - return build_table(groups, 'blue_light', width="300px") + return build_table(groups, 'blue_light', width="300px") @staticmethod def swadesh_statistics( @@ -13100,12 +13101,9 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - #TODO: move this condition - if ' заим.' in lex: - return set() # Split by commas and open brackets to separate # various forms of lexeme and extra note if is - return set(form.strip().lower() + return set(f" {form}".lower().replace(" заим.", "").strip() for form in lex.replace('(', ',').split(',') if form.strip() and ')' not in form) # exclude notes @@ -13213,28 +13211,32 @@ def split_lex(lex): for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - # Store entry_id and number of the lex within Swadesh' list - entries_set[perspective_id].add(entry_id) - swadesh_set[perspective_id].add(swadesh_num) # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, + 'borrowed': (" заим." 
in f" {word_list[0]} {translation_lex}"), 'swadesh': swadesh_lex, 'word': word_list[0], 'translation': translation_lex } + # Store entry_id and number of the lex within Swadesh' list + entries_set[perspective_id].add(entry_id) + if not result_pool[perspective_id][entry_id]['borrowed']: + swadesh_set[perspective_id].add(swadesh_num) # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met links = collections.OrderedDict() - for perspective, entries in entries_set.items(): - links[perspective] = set() + for perspective_id, entries in entries_set.items(): + links[perspective_id] = set() for group_index, group in enumerate(group_list): linked = entries & group if linked: - links[perspective].add(group_index) - result_pool[perspective][linked.pop()]['group'] = group_index + entry_id = linked.pop() + result_pool[perspective_id][entry_id]['group'] = group_index + if not result_pool[perspective_id][entry_id]['borrowed']: + links[perspective_id].add(group_index) dictionary_count = len(links) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) From e6058bba0e86931950ce70c67a78dbfb21f64570 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 14:46:16 +0300 Subject: [PATCH 45/69] pre-export to xlsx --- lingvodoc/schema/query.py | 77 ++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e5da7c78..aba56f8c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13015,40 +13015,26 @@ class Arguments: triumph = graphene.Boolean() result = graphene.String() + xlsx_url = graphene.String() minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) perspective_name_list = graphene.List(graphene.String) @staticmethod - def create_table(result_pool, group_count): + def export_dataframe(result_pool, group_count): ''' Keys: result_pool[perspective_id][entry_id] Fields: 'group': group_index, + 'borrowed': bool, 'swadesh': swadesh_lex, 'word': word_list[0], 'translation': translation_lex ''' import pandas as pd - from pretty_html_table import build_table - - ''' - space = ' ' - col_len = 50 - - # get length-fixed lines - def combine(*args): - result = space * 2 - fld_len = ((col_len - 2) // len(args)) - 2 - - for s in args: - result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" - - return result - ''' groups = pd.DataFrame() # re-group by group number and add joined values @@ -13064,20 +13050,46 @@ def combine(*args): else: groups.loc[group_count, dict_name] = entry_text group_count += 1 + + return groups + + @staticmethod + def export_xlsx( + result_dataframe, + base_language_name, + storage + ): + # Exporting analysis results as an Excel file. + + current_datetime = datetime.datetime.now(datetime.timezone.utc) + xlsx_filename = pathvalidate.sanitize_filename( + '{0} {1} {2:04d}.{3:02d}.{4:02d}.xlsx'.format( + base_language_name[:64], + 'glottochronology', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + cur_time = time.time() + storage_dir = os.path.join(storage['path'], 'glottochronology', str(cur_time)) + + # Storing Excel file with the results. 
+ + xlsx_path = os.path.join(storage_dir, xlsx_filename) + os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) + ''' - # iterate through 'groups' and 'single' and concatenate result - result = "" - # headers - result += ''.join(single[n][0] for n in range(dict_count)) + '\n\n' - # groups by lines - result += '\n'.join(''.join(line) for line in groups) + '\n' - # not-cognates by columns - for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry - for entry in entries[1:]) + '\n' + workbook_stream.seek(0) + + with open(xlsx_path, 'wb') as xlsx_file: + shutil.copyfileobj(workbook_stream, xlsx_file) ''' - return build_table(groups, 'blue_light', width="300px") + xlsx_url = ''.join([ + storage['prefix'], storage['static_route'], + 'glottochronology', '/', str(cur_time), '/', xlsx_filename]) + + return xlsx_url @staticmethod def swadesh_statistics( @@ -13088,6 +13100,8 @@ def swadesh_statistics( locale_id, storage): + from pretty_html_table import build_table + swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', @@ -13270,14 +13284,17 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.create_table(result_pool, len(group_list)) + result_dataframe = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframe, base_language_name, storage) + result_table = build_table(result_dataframe, 'blue_light', width="300px") result_dict = ( dict( triumph = True, - result = result, + result = result_table, + xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From 93d72c7ef8bf90db8528720f2f9b968235c7603a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 15:54:27 +0300 Subject: [PATCH 46/69] Export to xlsx --- lingvodoc/schema/query.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index aba56f8c..ea52613c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13078,12 +13078,7 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - ''' - workbook_stream.seek(0) - - with open(xlsx_path, 'wb') as xlsx_file: - shutil.copyfileobj(workbook_stream, xlsx_file) - ''' + result_dataframe.to_excel(xlsx_path, index=False, sheet_name='Glottochronology') xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 7ab3b3a5de4239aa7cee5730df68205bb775e649 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 12:42:18 +0300 Subject: [PATCH 47/69] Using phonological transcription --- lingvodoc/schema/query.py | 32 ++++++++++++++++---------------- server-requirements-1.txt | 2 +- server-requirements-final.txt | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index ea52613c..3b8e649c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13030,7 +13030,7 @@ def export_dataframe(result_pool, group_count): 'group': group_index, 'borrowed': bool, 'swadesh': swadesh_lex, - 'word': word_list[0], + 'transcription': transcription_list[0], 'translation': translation_lex ''' @@ -13044,7 +13044,7 @@ 
def export_dataframe(result_pool, group_count): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" + entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" if group_num: groups.loc[group_num, dict_name] = entry_text else: @@ -13129,7 +13129,7 @@ def split_lex(lex): entries_set = {} swadesh_set = {} result_pool = collections.OrderedDict() - for index, (perspective_id, word_field_id, translation_field_id) in \ + for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -13142,7 +13142,7 @@ def split_lex(lex): dictionary_name = perspective.parent.get_translation(locale_id) # Getting text data. - word_query = ( + transcription_query = ( DBSession .query( dbLexicalEntry.client_id, @@ -13153,15 +13153,15 @@ def split_lex(lex): dbLexicalEntry.marked_for_deletion == False, dbEntity.parent_client_id == dbLexicalEntry.client_id, dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == word_field_id[0], - dbEntity.field_object_id == word_field_id[1], + dbEntity.field_client_id == transcription_field_id[0], + dbEntity.field_object_id == transcription_field_id[1], dbEntity.marked_for_deletion == False, dbPublishingEntity.client_id == dbEntity.client_id, dbPublishingEntity.object_id == dbEntity.object_id, dbPublishingEntity.published == True, dbPublishingEntity.accepted == True) .add_columns( - func.array_agg(dbEntity.content).label('word')) + func.array_agg(dbEntity.content).label('transcription')) .group_by(dbLexicalEntry) .subquery()) @@ -13188,13 +13188,13 @@ def split_lex(lex): .group_by(dbLexicalEntry) .subquery()) - # Main query for word/translation data. + # Main query for transcription/translation data. data_query = ( DBSession - .query(word_query) + .query(transcription_query) .outerjoin(translation_query, and_( - word_query.c.client_id == translation_query.c.client_id, - word_query.c.object_id == translation_query.c.object_id)) + transcription_query.c.client_id == translation_query.c.client_id, + transcription_query.c.object_id == translation_query.c.object_id)) .add_columns( translation_query.c.translation) .all()) @@ -13205,10 +13205,10 @@ def split_lex(lex): result_pool[perspective_id] = {'name': dictionary_name} for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) - word_list, translation_list = row[2:4] + transcription_list, translation_list = row[2:4] - # If we have no words for this lexical entry, we skip it altogether. - if not word_list: + # If we have no transcriptions for this lexical entry, we skip it altogether. + if not transcription_list: continue translation_list = ( @@ -13223,9 +13223,9 @@ def split_lex(lex): # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, - 'borrowed': (" заим." in f" {word_list[0]} {translation_lex}"), + 'borrowed': (" заим." 
in f" {transcription_list[0]} {translation_lex}"), 'swadesh': swadesh_lex, - 'word': word_list[0], + 'transcription': transcription_list[0], 'translation': translation_lex } # Store entry_id and number of the lex within Swadesh' list diff --git a/server-requirements-1.txt b/server-requirements-1.txt index 50c1d5b0..1fa56ad7 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -65,7 +65,7 @@ pyramid-debugtoolbar==3.0.4 pyramid-mailer==0.15.1 pyramid-mako==1.0.2 pyramid-tm==1.0.1 -python-dateutil==2.8.0 +python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.3 pytz==2018.5 diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 5497c99d..a90fd4b7 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,3 +1,3 @@ matplotlib==1.5.3 -pandas==2.0.1 +pandas==1.4.3 pretty_html_table From fa350bfe905b4a771d186b19b284c1f857510417 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 14:05:05 +0300 Subject: [PATCH 48/69] Exclude tiny (<50 words) dictionaries --- lingvodoc/schema/query.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3b8e649c..44688cc6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,6 +13041,8 @@ def export_dataframe(result_pool, group_count): for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): + # 'entry' iterator may present 'name' or 'suite' field + # but not an inner dictionary for entry if not isinstance(entry, dict): continue group_num = entry['group'] @@ -13202,7 +13204,10 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() - result_pool[perspective_id] = {'name': dictionary_name} + result_pool[perspective_id] = { + 'name': dictionary_name, + 'suit': (len(data_query) > 50) + } for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) transcription_list, translation_list = row[2:4] @@ -13230,7 +13235,8 @@ def split_lex(lex): } # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) - if not result_pool[perspective_id][entry_id]['borrowed']: + if (result_pool[perspective_id]['suit'] and + not result_pool[perspective_id][entry_id]['borrowed']): swadesh_set[perspective_id].add(swadesh_num) # Create dictionary of sets: @@ -13244,7 +13250,8 @@ def split_lex(lex): if linked: entry_id = linked.pop() result_pool[perspective_id][entry_id]['group'] = group_index - if not result_pool[perspective_id][entry_id]['borrowed']: + if (result_pool[perspective_id]['suit'] and + not result_pool[perspective_id][entry_id]['borrowed']): links[perspective_id].add(group_index) dictionary_count = len(links) From f5b6e6042c5555f59b81715cd4b93591c5466a2a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 16:55:16 +0300 Subject: [PATCH 49/69] Sorting result table --- lingvodoc/schema/query.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 44688cc6..396b730a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -362,6 +362,9 @@ import lingvodoc.scripts.docx_import as docx_import +import pandas as pd +from pretty_html_table import build_table + # Setting up logging. 
log = logging.getLogger(__name__) logging.disable(level=logging.INFO) @@ -13034,15 +13037,14 @@ def export_dataframe(result_pool, group_count): 'translation': translation_lex ''' - import pandas as pd - groups = pd.DataFrame() + single = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): - # 'entry' iterator may present 'name' or 'suite' field - # but not an inner dictionary for entry + # 'entry' iterator may present string value of 'name' or 'suite' field + # but not a dictionary for one of entries. Continue in this case. if not isinstance(entry, dict): continue group_num = entry['group'] @@ -13050,14 +13052,15 @@ def export_dataframe(result_pool, group_count): if group_num: groups.loc[group_num, dict_name] = entry_text else: - groups.loc[group_count, dict_name] = entry_text + single.loc[group_count, dict_name] = entry_text group_count += 1 - return groups + return groups.sort_values(groups.columns[0]), single.sort_index() @staticmethod def export_xlsx( - result_dataframe, + result_dataframes, + sheet_names, base_language_name, storage ): @@ -13080,7 +13083,9 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - result_dataframe.to_excel(xlsx_path, index=False, sheet_name='Glottochronology') + with pd.ExcelWriter(xlsx_path) as writer: + for n, df in enumerate(result_dataframes): + df.to_excel(writer, index=False, sheet_name=sheet_names[n]) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], @@ -13097,8 +13102,6 @@ def swadesh_statistics( locale_id, storage): - from pretty_html_table import build_table - swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', @@ -13286,16 +13289,19 @@ def split_lex(lex): __plot_flag__ = False ) - result_dataframe = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) - xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframe, base_language_name, storage) - result_table = build_table(result_dataframe, 'blue_light', width="300px") + result_dataframes = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframes, + ['Cognates', 'Singles'], + base_language_name, + storage) + result_tables = (build_table(result_dataframes[0], 'blue_light', width="300px"), + build_table(result_dataframes[1], 'green_light', width="300px")) result_dict = ( - dict( triumph = True, - result = result_table, + result = f"{result_tables[0]}\n\n{result_tables[1]}", xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From e7b10ec184d557fff52740f047e656cf97ef8b53 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 17:14:35 +0300 Subject: [PATCH 50/69] Refactoring --- lingvodoc/schema/query.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 396b730a..3808a13a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13055,12 +13055,14 @@ def export_dataframe(result_pool, group_count): single.loc[group_count, dict_name] = entry_text group_count += 1 - return 
groups.sort_values(groups.columns[0]), single.sort_index() + return { + 'Cognates': groups.sort_values(groups.columns[0]), + 'Singles': single.sort_index() + } @staticmethod def export_xlsx( - result_dataframes, - sheet_names, + result, base_language_name, storage ): @@ -13084,8 +13086,8 @@ def export_xlsx( os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) with pd.ExcelWriter(xlsx_path) as writer: - for n, df in enumerate(result_dataframes): - df.to_excel(writer, index=False, sheet_name=sheet_names[n]) + for sheet_name, df in result.items(): + df.to_excel(writer, index=False, sheet_name=sheet_name) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], @@ -13289,13 +13291,10 @@ def split_lex(lex): __plot_flag__ = False ) - result_dataframes = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) - xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframes, - ['Cognates', 'Singles'], - base_language_name, - storage) - result_tables = (build_table(result_dataframes[0], 'blue_light', width="300px"), - build_table(result_dataframes[1], 'green_light', width="300px")) + result = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), + build_table(result['Singles'], 'green_light', width="300px")) result_dict = ( dict( From 3b195579c0b89a06ea2a31f70f2e479d13bf7cf1 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 17:44:46 +0300 Subject: [PATCH 51/69] Set columns width --- lingvodoc/schema/query.py | 3 ++- server-requirements-final.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3808a13a..c47f63fa 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13085,9 +13085,10 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - with pd.ExcelWriter(xlsx_path) as writer: + with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer: for sheet_name, df in result.items(): df.to_excel(writer, index=False, sheet_name=sheet_name) + writer.sheets[sheet_name].set_column(0, df.shape[1] - 1, 30) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], diff --git a/server-requirements-final.txt b/server-requirements-final.txt index a90fd4b7..1a41656e 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,3 +1,4 @@ matplotlib==1.5.3 pandas==1.4.3 pretty_html_table +xlsxwriter From a408f5b08cec66d2d2cd1f40d2f7b0664ac67d06 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 18:55:57 +0300 Subject: [PATCH 52/69] Bundles --- lingvodoc/schema/query.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c47f63fa..f6a87016 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13025,7 +13025,7 @@ class Arguments: perspective_name_list = graphene.List(graphene.String) @staticmethod - def export_dataframe(result_pool, group_count): + def export_dataframe(result_pool, bundles): ''' Keys: result_pool[perspective_id][entry_id] @@ -13039,6 +13039,7 @@ def export_dataframe(result_pool, group_count): groups = pd.DataFrame() single = pd.DataFrame() + row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): 
dict_name = f"{dict_index + 1}. {perspective['name']}" @@ -13049,11 +13050,11 @@ def export_dataframe(result_pool, group_count): continue group_num = entry['group'] entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" - if group_num: + if group_num and group_num in bundles: groups.loc[group_num, dict_name] = entry_text else: - single.loc[group_count, dict_name] = entry_text - group_count += 1 + single.loc[row_index, dict_name] = entry_text + row_index += 1 return { 'Cognates': groups.sort_values(groups.columns[0]), @@ -13267,18 +13268,16 @@ def split_lex(lex): # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives # commons_total means amount of Swadesh's lexems met in the both perspectives + bundles = set() for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - #print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): - #if n2 <= n1: continue #exclude duplicates and self-to-self + bundles.update(groups1 & groups2) commons_linked = len(groups1 & groups2) commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - #print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') - #print() _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13292,7 +13291,7 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + result = SwadeshAnalysis.export_dataframe(result_pool, bundles) xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) @@ -13301,7 +13300,7 @@ def split_lex(lex): dict( triumph = True, - result = f"{result_tables[0]}\n\n{result_tables[1]}", + result = f"{result_tables[0]}
\n\n
{result_tables[1]}", xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From ccd9935ec04fe693c7ce404f70cf412aa624f6d2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 20:00:19 +0300 Subject: [PATCH 53/69] Full transcription --- lingvodoc/schema/query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f6a87016..ae56775a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13229,15 +13229,16 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) + transcription_lex = ', '.join(transcription_list) for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, - 'borrowed': (" заим." in f" {transcription_list[0]} {translation_lex}"), + 'borrowed': (" заим." in f" {transcription_lex} {translation_lex}"), 'swadesh': swadesh_lex, - 'transcription': transcription_list[0], + 'transcription': transcription_lex, 'translation': translation_lex } # Store entry_id and number of the lex within Swadesh' list From e50fe2bc38461937fa515b005291263532ebfe6f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 20:43:35 +0300 Subject: [PATCH 54/69] Exclude self-to-self groups --- lingvodoc/schema/query.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index ae56775a..6981a120 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13273,12 +13273,15 @@ def split_lex(lex): for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] for n2, (perspective2, groups2) in enumerate(links.items()): - bundles.update(groups1 & groups2) - commons_linked = len(groups1 & groups2) - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) - # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 - distance_data_array[n1][n2] = distance + if n1 == n2: + distance_data_array[n1][n2] = 0 + else: + bundles.update(groups1 & groups2) + commons_linked = len(groups1 & groups2) + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + # commons_linked > 0 means that commons_total > 0 even more so + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + distance_data_array[n1][n2] = distance _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( From 9062dfb3998ff5fa54cbd73302d73ca62d0775aa Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 21:41:00 +0300 Subject: [PATCH 55/69] Fixed dependencies --- server-requirements-1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server-requirements-1.txt b/server-requirements-1.txt index 1fa56ad7..a2d1bb7c 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -68,7 +68,7 @@ pyramid-tm==1.0.1 python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.3 -pytz==2018.5 +pytz==2020.1 PyYAML==5.2 redis==2.10.5 regex==2019.6.8 From f85915ba4b0c81564b5ac60d1f52fe49a3848ce8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 23:22:50 +0300 
Subject: [PATCH 56/69] Text wrap --- lingvodoc/schema/query.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 6981a120..bd26f1bb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13087,9 +13087,19 @@ def export_xlsx( os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer: + header_format = writer.book.add_format({'bold': True, + 'text_wrap': True, + 'valign': 'top', + 'fg_color': '#D7E4BC', + 'border': 1}) for sheet_name, df in result.items(): - df.to_excel(writer, index=False, sheet_name=sheet_name) - writer.sheets[sheet_name].set_column(0, df.shape[1] - 1, 30) + df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False) + worksheet = writer.sheets[sheet_name] + worksheet.set_column(0, df.shape[1] - 1, 30) + # Write the column headers with the defined format. + for col_num, value in enumerate(df.columns.values): + worksheet.write(0, col_num, value, header_format) + worksheet.set_row(0, 70) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 6cc4a81a96b35d7f1fe60247d483e0215b36032c Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 23:23:47 +0300 Subject: [PATCH 57/69] Deps --- server-requirements-1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server-requirements-1.txt b/server-requirements-1.txt index a2d1bb7c..0441a615 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -20,7 +20,7 @@ configparser==4.0.2 cycler==0.10.0 DataProperty==0.42.1 defusedxml==0.6.0 -dill==0.3.5.1 +dill==0.3.6 docutils==0.15.2 dogpile.cache==0.6.8 et-xmlfile==1.0.1 From 0d2cf320413dc63b22a333f32d49584a5b386138 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 16:50:11 +0300 Subject: [PATCH 58/69] Garbage collecting --- lingvodoc/schema/query.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index bd26f1bb..86b5ad9a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13160,6 +13160,9 @@ def split_lex(lex): ) dictionary_name = perspective.parent.get_translation(locale_id) + # GC + del perspective + # Getting text data. transcription_query = ( DBSession @@ -13218,6 +13221,10 @@ def split_lex(lex): translation_query.c.translation) .all()) + # GC + del transcription_query + del translation_query + # Grouping translations by lexical entries. 
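The matching below relies on the lexeme-splitting rule changed earlier in the "Show borrowed words" patch: bracketed notes are dropped, the " заим." borrowing mark is stripped, and the remaining comma-separated forms are lower-cased. A standalone sketch of that function with invented inputs (the sample strings are illustrative only):

def split_lex(lex):
    # Split by commas and open brackets to separate
    # various forms of the lexeme from extra notes, if any.
    return set(f" {form}".lower().replace(" заим.", "").strip()
               for form in lex.replace('(', ',').split(',')
               if form.strip() and ')' not in form)  # exclude notes

print(sorted(split_lex("Kewe, заим. kiwi (onomat.)")))  # -> ['kewe', 'kiwi']
print(sorted(split_lex("рука (кисть)")))                # -> ['рука']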
entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() @@ -13257,6 +13264,9 @@ def split_lex(lex): not result_pool[perspective_id][entry_id]['borrowed']): swadesh_set[perspective_id].add(swadesh_num) + # GC + del data_query + # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met @@ -13293,6 +13303,15 @@ def split_lex(lex): distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance + result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + # GC + del result_pool + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), + build_table(result['Singles'], 'green_light', width="300px")) + # GC + del result + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, @@ -13305,11 +13324,6 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.export_dataframe(result_pool, bundles) - xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) - result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), - build_table(result['Singles'], 'green_light', width="300px")) - result_dict = ( dict( triumph = True, From df09c7079264563a9b06d42c7ed494e368f3823e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 18:06:29 +0300 Subject: [PATCH 59/69] Control output size --- lingvodoc/schema/query.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 86b5ad9a..db9d857d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13057,7 +13057,7 @@ def export_dataframe(result_pool, bundles): row_index += 1 return { - 'Cognates': groups.sort_values(groups.columns[0]), + 'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]), 'Singles': single.sort_index() } @@ -13304,13 +13304,24 @@ def split_lex(lex): distance_data_array[n1][n2] = distance result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + # GC del result_pool + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) + + # Control output size + huge_size = 1048576 + result = f"{result_tables[0]}
\n\n
{result_tables[1]}" + if len(result) > huge_size: + result = f"{result_tables[0]}
\n\nNote: The table with single words is not shown because the total output is too large
" + if len(result) > huge_size: + result = "
\n\nNote: The result tables are not shown because the total output is too large
" + # GC - del result + del result_tables _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13328,7 +13339,7 @@ def split_lex(lex): dict( triumph = True, - result = f"{result_tables[0]}
\n\n
{result_tables[1]}", + result = result, xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From 35bc7664b6908bde3dad414a4e71a43b038de6ad Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 19:33:14 +0300 Subject: [PATCH 60/69] Distances worksheet --- lingvodoc/schema/query.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index db9d857d..84583050 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13025,7 +13025,7 @@ class Arguments: perspective_name_list = graphene.List(graphene.String) @staticmethod - def export_dataframe(result_pool, bundles): + def export_dataframe(result_pool, distance_data_array, bundles): ''' Keys: result_pool[perspective_id][entry_id] @@ -13038,7 +13038,9 @@ def export_dataframe(result_pool, bundles): ''' groups = pd.DataFrame() - single = pd.DataFrame() + singles = pd.DataFrame() + distances = pd.DataFrame(distance_data_array, + columns=[perspective['name'] for perspective in result_pool.values()]) row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): @@ -13049,16 +13051,17 @@ def export_dataframe(result_pool, bundles): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" + entry_text = f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}" if group_num and group_num in bundles: groups.loc[group_num, dict_name] = entry_text else: - single.loc[row_index, dict_name] = entry_text + singles.loc[row_index, dict_name] = entry_text row_index += 1 return { 'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]), - 'Singles': single.sort_index() + 'Singles': singles.sort_index(), + 'Distances': distances.sort_index() } @staticmethod @@ -13093,7 +13096,14 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): - df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False) + index = (sheet_name == 'Distances') + startcol = int(index) + df.to_excel(writer, + sheet_name=sheet_name, + index=index, + startrow=1, + startcol=startcol, + header=False) worksheet = writer.sheets[sheet_name] worksheet.set_column(0, df.shape[1] - 1, 30) # Write the column headers with the defined format. 
@@ -13303,7 +13313,7 @@ def split_lex(lex): distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) # GC del result_pool From a5faa455b9eef593d5e1e9d67291acc57528ce32 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:18:58 +0300 Subject: [PATCH 61/69] Float distances --- lingvodoc/schema/query.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 84583050..287c89c6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,10 +13041,13 @@ def export_dataframe(result_pool, distance_data_array, bundles): singles = pd.DataFrame() distances = pd.DataFrame(distance_data_array, columns=[perspective['name'] for perspective in result_pool.values()]) + # Start index for distances from 1 to match with dictionaries numbers + distances.index += 1 + row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): - dict_name = f"{dict_index + 1}. {perspective['name']}" + dict_name = perspective['name'] for entry in perspective.values(): # 'entry' iterator may present string value of 'name' or 'suite' field # but not a dictionary for one of entries. Continue in this case. @@ -13102,13 +13105,12 @@ def export_xlsx( sheet_name=sheet_name, index=index, startrow=1, - startcol=startcol, header=False) worksheet = writer.sheets[sheet_name] - worksheet.set_column(0, df.shape[1] - 1, 30) + worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): - worksheet.write(0, col_num, value, header_format) + worksheet.write(0, col_num + startcol, value, header_format) worksheet.set_row(0, 70) xlsx_url = ''.join([ @@ -13239,7 +13241,7 @@ def split_lex(lex): entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() result_pool[perspective_id] = { - 'name': dictionary_name, + 'name': f"{index + 1}. 
{dictionary_name}", 'suit': (len(data_query) > 50) } for row_index, row in enumerate(data_query): @@ -13293,7 +13295,7 @@ def split_lex(lex): links[perspective_id].add(group_index) dictionary_count = len(links) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100, dtype='float') distance_header_array = numpy.full(dictionary_count, "", dtype='object') # Calculate intersection between lists of group numbers From 42d3ee3d7143952f555073390ff5efd37c20d207 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:26:12 +0300 Subject: [PATCH 62/69] Cleanup --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 287c89c6..6c227ce0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13099,19 +13099,21 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): + worksheet = writer.sheets[sheet_name] index = (sheet_name == 'Distances') startcol = int(index) + df.to_excel(writer, sheet_name=sheet_name, index=index, startrow=1, header=False) - worksheet = writer.sheets[sheet_name] + + worksheet.set_row(0, 70) worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): worksheet.write(0, col_num + startcol, value, header_format) - worksheet.set_row(0, 70) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 8cda6f7193ba692cd15e47976a54020153c441df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:36:00 +0300 Subject: [PATCH 63/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 6c227ce0..88a2a774 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13099,7 +13099,6 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): - worksheet = writer.sheets[sheet_name] index = (sheet_name == 'Distances') startcol = int(index) @@ -13109,6 +13108,7 @@ def export_xlsx( startrow=1, header=False) + worksheet = writer.sheets[sheet_name] worksheet.set_row(0, 70) worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. From 652cae7c4de62d5808c64b698992b1f4d379adf8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:45:30 +0300 Subject: [PATCH 64/69] Cleanup --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 88a2a774..4f9764ca 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13110,7 +13110,7 @@ def export_xlsx( worksheet = writer.sheets[sheet_name] worksheet.set_row(0, 70) - worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) + worksheet.set_column(startcol, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. 
for col_num, value in enumerate(df.columns.values): worksheet.write(0, col_num + startcol, value, header_format) From 221df2eecacd276d84587df53b5cc52f53d7c9d4 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 18:36:27 +0300 Subject: [PATCH 65/69] Fixed "more links than means" --- lingvodoc/schema/query.py | 69 +++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4f9764ca..b1cd7363 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13046,7 +13046,7 @@ def export_dataframe(result_pool, distance_data_array, bundles): row_index = 0 # re-group by group number and add joined values - for dict_index, perspective in enumerate(result_pool.values()): + for perspective in result_pool.values(): dict_name = perspective['name'] for entry in perspective.values(): # 'entry' iterator may present string value of 'name' or 'suite' field @@ -13158,10 +13158,10 @@ def split_lex(lex): # Getting text data for each perspective. # entries_set gathers entry_id(s) of words met in Swadesh' list - # swadesh_set gathers numbers of words within Swadesh' list + # swadesh_total gathers numbers of words within Swadesh' list entries_set = {} - swadesh_set = {} - result_pool = collections.OrderedDict() + swadesh_total = {} + result_pool = {} for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -13241,7 +13241,7 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() - swadesh_set[perspective_id] = set() + swadesh_total[perspective_id] = set() result_pool[perspective_id] = { 'name': f"{index + 1}. {dictionary_name}", 'suit': (len(data_query) > 50) @@ -13260,6 +13260,7 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) + # Parsing translations and matching with Swadesh's words transcription_lex = ', '.join(transcription_list) for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: @@ -13272,49 +13273,69 @@ def split_lex(lex): 'transcription': transcription_lex, 'translation': translation_lex } - # Store entry_id and number of the lex within Swadesh' list + # Store entry_id and number of the lex within Swadesh's list entries_set[perspective_id].add(entry_id) if (result_pool[perspective_id]['suit'] and not result_pool[perspective_id][entry_id]['borrowed']): - swadesh_set[perspective_id].add(swadesh_num) + # Total list of Swadesh's words in the perspective, + # they can have no any etimological links + swadesh_total[perspective_id].add(swadesh_num) # GC del data_query - # Create dictionary of sets: - # keys: pepspective_id - # values: numbers of etymological groups where an entry from dictionary is met - links = collections.OrderedDict() + # Checking if found entries have links + means = collections.OrderedDict() for perspective_id, entries in entries_set.items(): - links[perspective_id] = set() + means[perspective_id] = collections.defaultdict(set) for group_index, group in enumerate(group_list): + # Select etimologically linked entries linked = entries & group if linked: entry_id = linked.pop() result_pool[perspective_id][entry_id]['group'] = group_index + swadesh = result_pool[perspective_id][entry_id]['swadesh'] + # Store the correspondence: perspective { means(1/2/3) { etimological_groups(1.1/1.2/2.1/3.1) if (result_pool[perspective_id]['suit'] and not 
result_pool[perspective_id][entry_id]['borrowed']): - links[perspective_id].add(group_index) + means[perspective_id][swadesh].add(group_index) - dictionary_count = len(links) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100, dtype='float') + dictionary_count = len(means) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') distance_header_array = numpy.full(dictionary_count, "", dtype='object') - # Calculate intersection between lists of group numbers + # Calculate intersection between lists of linked means (Swadesh matching) # So length of this intersection is the similarity of corresponding perspectives - # commons_total means amount of Swadesh's lexems met in the both perspectives + # means_total is amount of Swadesh's lexems met in the both perspectives bundles = set() - for n1, (perspective1, groups1) in enumerate(links.items()): + # Calculate each-to-each distances, exclude self-to-self + for n1, (perspective1, means1) in enumerate(means.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - for n2, (perspective2, groups2) in enumerate(links.items()): + for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: distance_data_array[n1][n2] = 0 else: - bundles.update(groups1 & groups2) - commons_linked = len(groups1 & groups2) - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) - # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + # Common means of entries which have etimological linkes + # but this linkes may be not mutual + means_common = means1.keys() & means2.keys() + means_linked = 0 + # Checking if the found means have common links + for swadesh in means_common: + links_common = means1[swadesh] & means2[swadesh] + if links_common: + # Bundles are linkes with two or more entries in the result table + bundles.update(links_common) + means_linked += 1 + + means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) + + if n2 > n1 and len(means_common) > means_linked: + log.debug(f"{n1+1},{n2+1} : " + f"{len(means_common)} but {means_linked} of {means_total} : " + f"{', '.join(sorted(means_common))}") + + # means_linked > 0 means that means_total > 0 even more so + distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 distance_data_array[n1][n2] = distance result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) From 67353f170b051cb294d34e6aa423f1752f37b559 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 19:26:05 +0300 Subject: [PATCH 66/69] Distances web table --- lingvodoc/schema/query.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index b1cd7363..87bd76cc 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13336,7 +13336,7 @@ def split_lex(lex): # means_linked > 0 means that means_total > 0 even more so distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 - distance_data_array[n1][n2] = distance + distance_data_array[n1][n2] = round(distance, 2) result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) @@ -13344,16 +13344,19 @@ def split_lex(lex): del result_pool xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) - result_tables = (build_table(result['Cognates'], 
'blue_light', width="300px"), + result_tables = (build_table(result['Distances'], 'orange_light', width="300px", index=True), + build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) # Control output size - huge_size = 1048576 - result = f"{result_tables[0]}
\n\n
{result_tables[1]}" + huge_size = 262144 #1048576 + result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" if len(result) > huge_size: - result = f"{result_tables[0]}
\n\nNote: The table with single words is not shown due to huge summary size
" + result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ + f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: - result = "
\n\nNote: The result tables are not shown due to huge summary size
" + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" # GC del result_tables From 1da39a7569e37d05e7882657125f0308e9e562c6 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 21:09:26 +0300 Subject: [PATCH 67/69] Disabled word tables --- lingvodoc/schema/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 87bd76cc..42e3af37 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13329,7 +13329,7 @@ def split_lex(lex): means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) - if n2 > n1 and len(means_common) > means_linked: + if n2 > n1 and len(means_common) > 0: log.debug(f"{n1+1},{n2+1} : " f"{len(means_common)} but {means_linked} of {means_total} : " f"{', '.join(sorted(means_common))}") @@ -13348,6 +13348,7 @@ def split_lex(lex): build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) + ''' # Control output size huge_size = 262144 #1048576 result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" @@ -13355,8 +13356,10 @@ def split_lex(lex): result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: - result = f"{result_tables[0]}" \ - f"
\n\nNote: The result tables with words are not shown due to huge summary size
" + ''' + + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" # GC del result_tables From 01bc05966f7aba77364ad2baf67e557208341b6f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 23:05:03 +0300 Subject: [PATCH 68/69] The result tables are hidden --- lingvodoc/schema/query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 42e3af37..a41cfbf4 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13356,10 +13356,11 @@ def split_lex(lex): result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" ''' - result = f"{result_tables[0]}" \ - f"
\n\nNote: The result tables with words are not shown due to huge summary size
" + result = "Note: The result tables are hidden" # GC del result_tables From 6d3f94668e6de579f407ede1b9f6310a978e75cf Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 2 Jun 2023 16:10:30 +0300 Subject: [PATCH 69/69] Forget tiny dicts --- lingvodoc/schema/query.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a41cfbf4..ec6a912b 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13162,6 +13162,7 @@ def split_lex(lex): entries_set = {} swadesh_total = {} result_pool = {} + tiny_dicts = set() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -13242,10 +13243,7 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_total[perspective_id] = set() - result_pool[perspective_id] = { - 'name': f"{index + 1}. {dictionary_name}", - 'suit': (len(data_query) > 50) - } + result_pool[perspective_id] = {'name': dictionary_name} for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) transcription_list, translation_list = row[2:4] @@ -13275,12 +13273,18 @@ def split_lex(lex): } # Store entry_id and number of the lex within Swadesh's list entries_set[perspective_id].add(entry_id) - if (result_pool[perspective_id]['suit'] and - not result_pool[perspective_id][entry_id]['borrowed']): + if not result_pool[perspective_id][entry_id]['borrowed']: # Total list of Swadesh's words in the perspective, # they can have no any etimological links swadesh_total[perspective_id].add(swadesh_num) + # Forget the dictionary if it contains less than 50 Swadesh words + if len(swadesh_total[perspective_id]) < 50: + del entries_set[perspective_id] + del swadesh_total[perspective_id] + del result_pool[perspective_id] + tiny_dicts.add(dictionary_name) + # GC del data_query @@ -13296,8 +13300,7 @@ def split_lex(lex): result_pool[perspective_id][entry_id]['group'] = group_index swadesh = result_pool[perspective_id][entry_id]['swadesh'] # Store the correspondence: perspective { means(1/2/3) { etimological_groups(1.1/1.2/2.1/3.1) - if (result_pool[perspective_id]['suit'] and - not result_pool[perspective_id][entry_id]['borrowed']): + if not result_pool[perspective_id][entry_id]['borrowed']: means[perspective_id][swadesh].add(group_index) dictionary_count = len(means) @@ -13310,6 +13313,8 @@ def split_lex(lex): bundles = set() # Calculate each-to-each distances, exclude self-to-self for n1, (perspective1, means1) in enumerate(means.items()): + # Numerate dictionaries + result_pool[perspective1]['name'] = f"{n1 + 1}. {result_pool[perspective1]['name']}" distance_header_array[n1] = result_pool[perspective1]['name'] for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: @@ -13348,9 +13353,8 @@ def split_lex(lex): build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) - ''' # Control output size - huge_size = 262144 #1048576 + huge_size = 1048576 result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" if len(result) > huge_size: result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ @@ -13358,9 +13362,8 @@ def split_lex(lex): if len(result) > huge_size: result = f"{result_tables[0]}" \ f"
\n\nNote: The result tables with words are not shown due to huge summary size
" - ''' - - result = "Note: The result tables are hidden" + result += ("
Note: The following dictionaries contain too few words and were not processed: \n\n" +
+                   '\n'.join(tiny_dicts) + "
") if tiny_dicts else "" # GC del result_tables