From 157d113141540ac889787cd6ed79387698497212 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 15 May 2023 02:32:06 +0300 Subject: [PATCH 01/69] Initial commit --- lingvodoc/schema/query.py | 146 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c270b5c..726c7921 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,6 +364,7 @@ # Setting up logging. log = logging.getLogger(__name__) +logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -9232,6 +9233,7 @@ def async_cognate_analysis( with transaction.manager: try: + breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -9883,6 +9885,7 @@ def tag_data_plpgsql( entry_already_set.update(entry_id_set) group_list.append(entry_id_set) + breakpoint() return entry_already_set, group_list, time.time() - start_time @staticmethod @@ -10970,6 +10973,7 @@ def perform_cognate_analysis( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) + breakpoint() else: @@ -11034,7 +11038,7 @@ def perform_cognate_analysis( sg_both_count = 0 source_perspective_index = None - + breakpoint() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -12630,6 +12634,140 @@ def f(axes, embedding_pca): return CognateAnalysis(**result_dict) + @staticmethod + def swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status=None, + __debug_flag__=False, + __intermediate_flag__=False): + + # Gathering entry grouping data. + perspective_dict = collections.defaultdict(dict) + + # entry_already_set = set() + # group_list = [] + # tag_dict = collections.defaultdict(set) + + text_dict = {} + entry_id_dict = {} + + entry_already_set, group_list, group_time = ( + CognateAnalysis.tag_data_plpgsql( + perspective_info_list, group_field_id)) + + # Getting text data for each perspective. + + # dbTranslation = aliased(dbEntity, name='Translation') + # dbPublishingTranslation = aliased(dbPublishingEntity, name='PublishingTranslation') + # source_perspective_index = None + + for index, (perspective_id, transcription_field_id, translation_field_id) in \ + enumerate(perspective_info_list): + + # if perspective_id == source_perspective_id: + # source_perspective_index = index + + # Getting and saving perspective info. 
+ perspective = DBSession.query(dbPerspective).filter_by( + client_id=perspective_id[0], object_id=perspective_id[1]).first() + + perspective_name = perspective.get_translation(locale_id) + dictionary_name = perspective.parent.get_translation(locale_id) + + transcription_rules = ( + '' if not perspective.additional_metadata else + perspective.additional_metadata.get('transcription_rules', '')) + + perspective_data = perspective_dict[perspective_id] + + perspective_data['perspective_name'] = perspective_name + perspective_data['dictionary_name'] = dictionary_name + perspective_data['transcription_rules'] = transcription_rules + + log.debug( + '\ncognate_analysis {0}:' + '\n dictionary {1}/{2}: {3}' + '\n perspective {4}/{5}: {6}' + '\n transcription_rules: {7}'.format( + language_str, + perspective.parent_client_id, perspective.parent_object_id, + repr(dictionary_name.strip()), + perspective_id[0], perspective_id[1], + repr(perspective_name.strip()), + repr(transcription_rules))) + + # Getting text data. + translation_query = ( + DBSession.query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id).filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == translation_field_id[0], + dbEntity.field_object_id == translation_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) + + .add_columns( + func.array_agg(dbEntity.content).label('translation')) + + .group_by(dbLexicalEntry)) + + # If we are in asynchronous mode, we need to look up how many data rows we need + # to process for this perspective. + if task_status is not None: + row_count = translation_query.count() + + log.debug( + 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( + language_str, + perspective_id[0], perspective_id[1], + row_count)) + + # Grouping translations by lexical entries. + for row_index, row in enumerate(translation_query.all()): + entry_id = tuple(row[:2]) + transcription_list, translation_list = row[2:4] + + translation_list = ( + [] if not translation_list else [ + translation.strip() + for translation in translation_list + if translation.strip()]) + + # Saving translation data. + entry_data_list = (index, translation_list) + text_dict[entry_id] = entry_data_list + + entry_id_key = ( + index, + (' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) + + entry_id_dict[entry_id_key] = entry_id + @staticmethod def mutate(self, info, **args): """ @@ -12823,7 +12961,7 @@ def mutate(self, info, **args): cognate_suggestions_f if mode == 'suggestions' else cognate_analysis_f) - if analysis_f is None: + if analysis_f is None and False: return ResponseError(message = 'Analysis library fuction \'{0}()\' is absent, ' @@ -12875,7 +13013,7 @@ def mutate(self, info, **args): request.response.status = HTTPOk.code if synchronous: - + breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -12933,7 +13071,7 @@ def mutate(self, info, **args): # We do not use acoustic data, so we perform cognate analysis synchronously. 
else: - + #breakpoint() return CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From 41e87deca7d434d1008660f276d8d9193cc81843 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 15 May 2023 21:27:32 +0300 Subject: [PATCH 02/69] return CognateAnalysis.swadesh_statistics --- lingvodoc/schema/query.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 726c7921..8cf51fd1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -9233,7 +9233,6 @@ def async_cognate_analysis( with transaction.manager: try: - breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -9885,7 +9884,6 @@ def tag_data_plpgsql( entry_already_set.update(entry_id_set) group_list.append(entry_id_set) - breakpoint() return entry_already_set, group_list, time.time() - start_time @staticmethod @@ -12750,7 +12748,7 @@ def swadesh_statistics( # Grouping translations by lexical entries. for row_index, row in enumerate(translation_query.all()): entry_id = tuple(row[:2]) - transcription_list, translation_list = row[2:4] + translation_list = row[2] translation_list = ( [] if not translation_list else [ @@ -12758,6 +12756,8 @@ def swadesh_statistics( for translation in translation_list if translation.strip()]) + print(translation_list) + # Saving translation data. entry_data_list = (index, translation_list) text_dict[entry_id] = entry_data_list @@ -13068,10 +13068,34 @@ def mutate(self, info, **args): return CognateAnalysis(triumph = True) - # We do not use acoustic data, so we perform cognate analysis synchronously. + elif mode == 'swadesh': + return CognateAnalysis.swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # We do not use acoustic data, so we perform cognate analysis synchronously. else: - #breakpoint() + return CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From cff8e76de1f02e93985dcf431cc1d6198300e9e2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 13:30:40 +0300 Subject: [PATCH 03/69] Next steps --- lingvodoc/schema/query.py | 97 ++++++++++++++------------------------- 1 file changed, 35 insertions(+), 62 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8cf51fd1..377687f4 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10971,7 +10971,6 @@ def perform_cognate_analysis( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - breakpoint() else: @@ -12657,93 +12656,67 @@ def swadesh_statistics( __intermediate_flag__=False): # Gathering entry grouping data. - perspective_dict = collections.defaultdict(dict) - - # entry_already_set = set() - # group_list = [] - # tag_dict = collections.defaultdict(set) - + #perspective_dict = collections.defaultdict(dict) text_dict = {} entry_id_dict = {} - entry_already_set, group_list, group_time = ( + _, group_list, group_time = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - # Getting text data for each perspective. 
- - # dbTranslation = aliased(dbEntity, name='Translation') - # dbPublishingTranslation = aliased(dbPublishingEntity, name='PublishingTranslation') - # source_perspective_index = None + #print(f"*** Group list: {group_list}") + # Getting text data for each perspective. for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): - # if perspective_id == source_perspective_id: - # source_perspective_index = index - # Getting and saving perspective info. - perspective = DBSession.query(dbPerspective).filter_by( - client_id=perspective_id[0], object_id=perspective_id[1]).first() + perspective = ( + DBSession + .query(dbPerspective) + .filter_by(client_id=perspective_id[0], object_id=perspective_id[1]) + .first() + ) perspective_name = perspective.get_translation(locale_id) dictionary_name = perspective.parent.get_translation(locale_id) - - transcription_rules = ( - '' if not perspective.additional_metadata else - perspective.additional_metadata.get('transcription_rules', '')) - - perspective_data = perspective_dict[perspective_id] - - perspective_data['perspective_name'] = perspective_name - perspective_data['dictionary_name'] = dictionary_name - perspective_data['transcription_rules'] = transcription_rules + #perspective_data = perspective_dict[perspective_id] + #perspective_data['perspective_name'] = perspective_name + #perspective_data['dictionary_name'] = dictionary_name log.debug( '\ncognate_analysis {0}:' '\n dictionary {1}/{2}: {3}' - '\n perspective {4}/{5}: {6}' - '\n transcription_rules: {7}'.format( + '\n perspective {4}/{5}: {6}'.format( language_str, perspective.parent_client_id, perspective.parent_object_id, repr(dictionary_name.strip()), perspective_id[0], perspective_id[1], - repr(perspective_name.strip()), - repr(transcription_rules))) + repr(perspective_name.strip()))) # Getting text data. translation_query = ( - DBSession.query( - dbLexicalEntry.client_id, - dbLexicalEntry.object_id).filter( - dbLexicalEntry.parent_client_id == perspective_id[0], - dbLexicalEntry.parent_object_id == perspective_id[1], - dbLexicalEntry.marked_for_deletion == False, - dbEntity.parent_client_id == dbLexicalEntry.client_id, - dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == translation_field_id[0], - dbEntity.field_object_id == translation_field_id[1], - dbEntity.marked_for_deletion == False, - dbPublishingEntity.client_id == dbEntity.client_id, - dbPublishingEntity.object_id == dbEntity.object_id, - dbPublishingEntity.published == True, - dbPublishingEntity.accepted == True) - + DBSession + .query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id) + .filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == translation_field_id[0], + dbEntity.field_object_id == translation_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) .add_columns( - func.array_agg(dbEntity.content).label('translation')) - - .group_by(dbLexicalEntry)) - - # If we are in asynchronous mode, we need to look up how many data rows we need - # to process for this perspective. 
- if task_status is not None: - row_count = translation_query.count() - - log.debug( - 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( - language_str, - perspective_id[0], perspective_id[1], - row_count)) + func.array_agg(dbEntity.content).label('translation')) + .group_by(dbLexicalEntry) + .all()) # Grouping translations by lexical entries. for row_index, row in enumerate(translation_query.all()): From 41cd7354b8b8f4398bfaf97aa2014f73095fe25d Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 13:32:35 +0300 Subject: [PATCH 04/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 377687f4..9314300d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12719,7 +12719,7 @@ def swadesh_statistics( .all()) # Grouping translations by lexical entries. - for row_index, row in enumerate(translation_query.all()): + for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] From bf175c2f79b4ee3ef83f641a0559dc07fc0b0a26 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 14:51:09 +0300 Subject: [PATCH 05/69] Compare_translations --- lingvodoc/schema/query.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 9314300d..cb57da26 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12655,6 +12655,24 @@ def swadesh_statistics( __debug_flag__=False, __intermediate_flag__=False): + swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', + 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', + 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', + 'волосы','голова','ухо','глаз','нос','рот','зуб','язык (орган)','ноготь','нога (стопа)','колено', + 'рука (кисть)','живот','горло','грудь','сердце','печень','пить','есть (кушать)','кусать','видеть', + 'слышать','знать','спать','умирать','убивать','плавать','летать','гулять','приходить','лежать', + 'сидеть','стоять','дать','сказать','солнце','луна','звезда','вода','дождь','камень','песок', + 'земля','облако','дым','огонь','пепел','гореть','дорога,тропа','гора','красный','зелёный', + 'жёлтый','белый','чёрный','ночь','тёплый','холодный','полный','новый','хороший','круглый', + 'сухой','имя'] + + def compare_translations(swadesh_lex, dictionary_lex): + def split_lex(lex): + return set(form.lower() + for form in lex.replace(' ', ',').split(',') + if form and '(' not in form) + return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) + # Gathering entry grouping data. #perspective_dict = collections.defaultdict(dict) text_dict = {} @@ -12729,7 +12747,7 @@ def swadesh_statistics( for translation in translation_list if translation.strip()]) - print(translation_list) + print(entry_id, translation_list) # Saving translation data. 
entry_data_list = (index, translation_list) From ffcaf089189077996f70b3462bff36e1d459a1c5 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 16:56:00 +0300 Subject: [PATCH 06/69] Loop by 100words and by translation_list --- lingvodoc/schema/query.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index cb57da26..fdde67d0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12668,24 +12668,23 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - return set(form.lower() - for form in lex.replace(' ', ',').split(',') - if form and '(' not in form) + # Split by comma and open bracket to separate + # various forms of lexem and extra explanation if is + return set(form.strip().lower() + for form in lex.replace('(', ',').split(',') + if form and ')' not in form) return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) # Gathering entry grouping data. - #perspective_dict = collections.defaultdict(dict) text_dict = {} entry_id_dict = {} - _, group_list, group_time = ( + _, group_list, _ = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) - #print(f"*** Group list: {group_list}") - # Getting text data for each perspective. - for index, (perspective_id, transcription_field_id, translation_field_id) in \ + for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -12698,9 +12697,6 @@ def split_lex(lex): perspective_name = perspective.get_translation(locale_id) dictionary_name = perspective.parent.get_translation(locale_id) - #perspective_data = perspective_dict[perspective_id] - #perspective_data['perspective_name'] = perspective_name - #perspective_data['dictionary_name'] = dictionary_name log.debug( '\ncognate_analysis {0}:' @@ -12747,7 +12743,10 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) - print(entry_id, translation_list) + for swadesh_lex in swadesh_list: + for translation_lex in translation_list: + if compare_translations(swadesh_lex, translation_lex): + print(entry_id, translation_lex) # Saving translation data. entry_data_list = (index, translation_list) From e6108005617ce92893a41a26e4e3ef0496fb63c2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 21:21:26 +0300 Subject: [PATCH 07/69] First result --- lingvodoc/schema/query.py | 45 +++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index fdde67d0..59a8c12f 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12672,21 +12672,19 @@ def split_lex(lex): # various forms of lexem and extra explanation if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form and ')' not in form) + if form and (')' not in form)) return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) - # Gathering entry grouping data. - text_dict = {} - entry_id_dict = {} - _, group_list, _ = ( CognateAnalysis.tag_data_plpgsql( perspective_info_list, group_field_id)) # Getting text data for each perspective. + entries_map = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): + ''' # Getting and saving perspective info. 
perspective = ( DBSession @@ -12707,6 +12705,7 @@ def split_lex(lex): repr(dictionary_name.strip()), perspective_id[0], perspective_id[1], repr(perspective_name.strip()))) + ''' # Getting text data. translation_query = ( @@ -12733,6 +12732,7 @@ def split_lex(lex): .all()) # Grouping translations by lexical entries. + entries_map[perspective_id] = set() for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] @@ -12746,17 +12746,30 @@ def split_lex(lex): for swadesh_lex in swadesh_list: for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - print(entry_id, translation_lex) - - # Saving translation data. - entry_data_list = (index, translation_list) - text_dict[entry_id] = entry_data_list - - entry_id_key = ( - index, - (' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) - - entry_id_dict[entry_id_key] = entry_id + entries_map[perspective_id].add(entry_id) + #print(entry_id, translation_lex) + + # Create dictionary of sets: + # keys: pepspective_id + # values: numbers of groups where an entry from dictionary is met + links = {} + for perspective, entries in entries_map.items(): + links[perspective] = set() + for index_group, group in enumerate(group_list): + if (entries & group): + links[perspective].add(index_group) + + # Calculate intersection between lists of group numbers for all the perspectives + # So length of this intersection is the similarity of corresponding perspectives + similarity = {} + for perspective1, groups1 in links.items(): + similarity[perspective1] = {} + print(perspective1, end=' :: ') + for perspective2, groups2 in links.items(): + commons = len(groups1 & groups2) + similarity[perspective1][perspective2] = commons + print(f"{perspective2}:{commons}", end=' | ') + print() @staticmethod def mutate(self, info, **args): From 63dcb47438818d1becb777c8fef8dc8eb1d02e05 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 16 May 2023 21:41:19 +0300 Subject: [PATCH 08/69] Minor --- lingvodoc/schema/query.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 59a8c12f..f50a2af1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12668,11 +12668,12 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - # Split by comma and open bracket to separate - # various forms of lexem and extra explanation if is + # Split by commas and open brackets to separate + # various forms of lexem and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form and (')' not in form)) + if form.strip() and (')' not in form)) #exclude notes + # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) _, group_list, _ = ( @@ -12680,6 +12681,7 @@ def split_lex(lex): perspective_info_list, group_field_id)) # Getting text data for each perspective. 
+ # entries_map gathers words from Swadesh' list met in perspectives entries_map = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -12751,7 +12753,7 @@ def split_lex(lex): # Create dictionary of sets: # keys: pepspective_id - # values: numbers of groups where an entry from dictionary is met + # values: numbers of etymological groups where an entry from dictionary is met links = {} for perspective, entries in entries_map.items(): links[perspective] = set() From 18a340cc184edf25440f07f87799ce079422e649 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 15:05:06 +0300 Subject: [PATCH 09/69] Separated classes --- lingvodoc/schema/query.py | 418 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 418 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f50a2af1..46420db6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12631,6 +12631,424 @@ def f(axes, embedding_pca): return CognateAnalysis(**result_dict) + @staticmethod + def mutate(self, info, **args): + """ + mutation CognateAnalysis { + cognate_analysis( + base_language_id: [508, 41], + group_field_id: [66, 25], + perspective_info_list: [ + [[425, 4], [66, 8], [66, 10]], + [[1552, 1759], [66, 8], [66, 10]], + [[418, 4], [66, 8], [66, 10]]]) + { + triumph + entity_count + dictionary_count + group_count + not_enough_count + text_count + result + } + } + """ + + # Administrator / perspective author / editing permission check. + + error_str = ( + 'Only administrator, perspective author and users with perspective editing permissions ' + 'can perform cognate analysis.') + + client_id = info.context.request.authenticated_userid + + if not client_id: + raise ResponseError(error_str) + + user = Client.get_user_by_client_id(client_id) + + author_client_id_set = ( + + set( + client_id + for (client_id, _), _, _ in args['perspective_info_list'])) + + author_id_check = ( + + DBSession + + .query( + + DBSession + .query(literal(1)) + .filter( + Client.id.in_(author_client_id_set), + Client.user_id == user.id) + .exists()) + + .scalar()) + + if (user.id != 1 and + not author_id_check and + not info.context.acl_check_if('edit', 'perspective', args['source_perspective_id'])): + + raise ResponseError(error_str) + + # Getting arguments. + + source_perspective_id = args['source_perspective_id'] + base_language_id = args['base_language_id'] + + group_field_id = args['group_field_id'] + perspective_info_list = args['perspective_info_list'] + multi_list = args.get('multi_list') + + mode = args.get('mode') + + distance_flag = args.get('distance_flag') + reference_perspective_id = args.get('reference_perspective_id') + + figure_flag = args.get('figure_flag') + distance_vowel_flag = args.get('distance_vowel_flag') + distance_consonant_flag = args.get('distance_consonant_flag') + + match_translations_value = args.get('match_translations_value', 1) + only_orphans_flag = args.get('only_orphans_flag', True) + + __debug_flag__ = args.get('debug_flag', False) + __intermediate_flag__ = args.get('intermediate_flag', False) + + synchronous = args.get('synchronous', False) + + language_str = ( + '{0}/{1}, language {2}/{3}'.format( + source_perspective_id[0], source_perspective_id[1], + base_language_id[0], base_language_id[1])) + + try: + + # Getting base language info. 
+ + locale_id = info.context.get('locale_id') or 2 + + base_language = DBSession.query(dbLanguage).filter_by( + client_id = base_language_id[0], object_id = base_language_id[1]).first() + + base_language_name = base_language.get_translation(locale_id) + + request = info.context.request + storage = request.registry.settings['storage'] + + # Getting multi-language info, if required. + + if multi_list is None: + multi_list = [] + + multi_name_list = [] + + for language_id, perspective_count in multi_list: + + language = DBSession.query(dbLanguage).filter_by( + client_id = language_id[0], object_id = language_id[1]).first() + + multi_name_list.append( + language.get_translation(locale_id)) + + # Language tag. + + if mode == 'multi': + + multi_str = ', '.join( + '{0}/{1}'.format(*id) + for id, count in multi_list) + + language_str = ( + '{0}/{1}, languages {2}'.format( + source_perspective_id[0], source_perspective_id[1], + multi_str)) + + # Showing cognate analysis info, checking cognate analysis library presence. + + log.debug( + '\ncognate_analysis {}:' + '\n base language: {}' + '\n group field: {}/{}' + '\n perspectives and transcription/translation fields: {}' + '\n multi_list: {}' + '\n multi_name_list: {}' + '\n mode: {}' + '\n distance_flag: {}' + '\n reference_perspective_id: {}' + '\n figure_flag: {}' + '\n distance_vowel_flag: {}' + '\n distance_consonant_flag: {}' + '\n match_translations_value: {}' + '\n only_orphans_flag: {} ({})' + '\n __debug_flag__: {}' + '\n __intermediate_flag__: {}' + '\n cognate_analysis_f: {}' + '\n cognate_acoustic_analysis_f: {}' + '\n cognate_distance_analysis_f: {}' + '\n cognate_reconstruction_f: {}' + '\n cognate_reconstruction_multi_f: {}' + '\n cognate_suggestions_f: {}'.format( + language_str, + repr(base_language_name.strip()), + group_field_id[0], group_field_id[1], + perspective_info_list, + multi_list, + multi_name_list, + repr(mode), + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, int(only_orphans_flag), + __debug_flag__, + __intermediate_flag__, + repr(cognate_analysis_f), + repr(cognate_acoustic_analysis_f), + repr(cognate_distance_analysis_f), + repr(cognate_reconstruction_f), + repr(cognate_reconstruction_multi_f), + repr(cognate_suggestions_f))) + + # Checking if we have analysis function ready. + + analysis_f = ( + cognate_acoustic_analysis_f if mode == 'acoustic' else + cognate_reconstruction_f if mode == 'reconstruction' else + cognate_reconstruction_multi_f if mode == 'multi' else + cognate_suggestions_f if mode == 'suggestions' else + cognate_analysis_f) + + if analysis_f is None and False: + + return ResponseError(message = + 'Analysis library fuction \'{0}()\' is absent, ' + 'please contact system administrator.'.format( + 'CognateAcousticAnalysis_GetAllOutput' if mode == 'acoustic' else + 'CognateReconstruct_GetAllOutput' if mode == 'reconstruction' else + 'CognateMultiReconstruct_GetAllOutput' if mode == 'multi' else + 'GuessCognates_GetAllOutput' if mode == 'suggestions' else + 'CognateAnalysis_GetAllOutput')) + + # Transforming client/object pair ids from lists to 2-tuples. 
+ + source_perspective_id = tuple(source_perspective_id) + base_language_id = tuple(base_language_id) + group_field_id = tuple(group_field_id) + + perspective_info_list = [ + + (tuple(perspective_id), + tuple(transcription_field_id), + tuple(translation_field_id)) + + for perspective_id, + transcription_field_id, + translation_field_id in perspective_info_list] + + multi_list = [ + [tuple(language_id), perspective_count] + for language_id, perspective_count in multi_list] + + if reference_perspective_id is not None: + reference_perspective_id = tuple(reference_perspective_id) + + # If we are to use acoustic data, we will launch cognate analysis in asynchronous mode. + + if mode == 'acoustic': + + client_id = request.authenticated_userid + + user_id = ( + Client.get_user_by_client_id(client_id).id + if client_id else anonymous_userid(request)) + + task_status = TaskStatus( + user_id, 'Cognate acoustic analysis', base_language_name, 5) + + # Launching cognate acoustic analysis asynchronously. + + request.response.status = HTTPOk.code + + if synchronous: + breakpoint() + CognateAnalysis.perform_cognate_analysis( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + None, + None, + None, + None, + None, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status, + __debug_flag__, + __intermediate_flag__) + + else: + + async_cognate_analysis.delay( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status.key, + request.registry.settings['cache_kwargs'], + request.registry.settings['sqlalchemy.url'], + __debug_flag__, + __intermediate_flag__) + + # Signifying that we've successfully launched asynchronous cognate acoustic analysis. + + return CognateAnalysis(triumph = True) + + elif mode == 'swadesh': + + return CognateAnalysis.swadesh_statistics( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # We do not use acoustic data, so we perform cognate analysis synchronously. + else: + + return CognateAnalysis.perform_cognate_analysis( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + None, + __debug_flag__, + __intermediate_flag__) + + # Exception occured while we tried to perform cognate analysis. 
+ + except Exception as exception: + + traceback_string = ''.join(traceback.format_exception( + exception, exception, exception.__traceback__))[:-1] + + log.warning( + 'cognate_analysis {0}: exception'.format( + language_str)) + + log.warning(traceback_string) + + return ResponseError(message = + 'Exception:\n' + traceback_string) + + +class SwadeshAnalysis(graphene.Mutation): + class Arguments: + + source_perspective_id = LingvodocID(required = True) + base_language_id = LingvodocID(required = True) + + group_field_id = LingvodocID(required = True) + perspective_info_list = graphene.List(graphene.List(LingvodocID), required = True) + multi_list = graphene.List(ObjectVal) + + mode = graphene.String() + + distance_flag = graphene.Boolean() + reference_perspective_id = LingvodocID() + + figure_flag = graphene.Boolean() + distance_vowel_flag = graphene.Boolean() + distance_consonant_flag = graphene.Boolean() + + match_translations_value = graphene.Int() + only_orphans_flag = graphene.Boolean() + + debug_flag = graphene.Boolean() + intermediate_flag = graphene.Boolean() + + synchronous = graphene.Boolean() + + triumph = graphene.Boolean() + + dictionary_count = graphene.Int() + group_count = graphene.Int() + not_enough_count = graphene.Int() + transcription_count = graphene.Int() + translation_count = graphene.Int() + + result = graphene.String() + xlsx_url = graphene.String() + distance_list = graphene.Field(ObjectVal) + figure_url = graphene.String() + + minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) + embedding_2d = graphene.List(graphene.List(graphene.Float)) + embedding_3d = graphene.List(graphene.List(graphene.Float)) + perspective_name_list = graphene.List(graphene.String) + + suggestion_list = graphene.List(ObjectVal) + suggestion_field_id = LingvodocID() + + intermediate_url_list = graphene.List(graphene.String) + @staticmethod def swadesh_statistics( language_str, From 43b49484dc1bec38cdf9e9f700d7046fe5ea2435 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 18:39:13 +0300 Subject: [PATCH 10/69] Next steps to separate classes --- lingvodoc/schema/query.py | 331 ++------------------------------------ 1 file changed, 14 insertions(+), 317 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 46420db6..393204a1 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13007,71 +13007,15 @@ class Arguments: group_field_id = LingvodocID(required = True) perspective_info_list = graphene.List(graphene.List(LingvodocID), required = True) - multi_list = graphene.List(ObjectVal) - - mode = graphene.String() - - distance_flag = graphene.Boolean() - reference_perspective_id = LingvodocID() - - figure_flag = graphene.Boolean() - distance_vowel_flag = graphene.Boolean() - distance_consonant_flag = graphene.Boolean() - - match_translations_value = graphene.Int() - only_orphans_flag = graphene.Boolean() - - debug_flag = graphene.Boolean() - intermediate_flag = graphene.Boolean() - - synchronous = graphene.Boolean() triumph = graphene.Boolean() - dictionary_count = graphene.Int() - group_count = graphene.Int() - not_enough_count = graphene.Int() - transcription_count = graphene.Int() - translation_count = graphene.Int() - - result = graphene.String() - xlsx_url = graphene.String() - distance_list = graphene.Field(ObjectVal) - figure_url = graphene.String() - - minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) - embedding_2d = graphene.List(graphene.List(graphene.Float)) - embedding_3d = 
graphene.List(graphene.List(graphene.Float)) - perspective_name_list = graphene.List(graphene.String) - - suggestion_list = graphene.List(ObjectVal) - suggestion_field_id = LingvodocID() - - intermediate_url_list = graphene.List(graphene.String) - @staticmethod def swadesh_statistics( language_str, - source_perspective_id, - base_language_id, - base_language_name, group_field_id, perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status=None, - __debug_flag__=False, - __intermediate_flag__=False): + locale_id): swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', @@ -13117,7 +13061,7 @@ def split_lex(lex): dictionary_name = perspective.parent.get_translation(locale_id) log.debug( - '\ncognate_analysis {0}:' + '\nswadesh_analysis {0}:' '\n dictionary {1}/{2}: {3}' '\n perspective {4}/{5}: {6}'.format( language_str, @@ -13194,8 +13138,8 @@ def split_lex(lex): @staticmethod def mutate(self, info, **args): """ - mutation CognateAnalysis { - cognate_analysis( + mutation SwadeshAnalysis { + swadesh_analysis( base_language_id: [508, 41], group_field_id: [66, 25], perspective_info_list: [ @@ -13203,22 +13147,14 @@ def mutate(self, info, **args): [[1552, 1759], [66, 8], [66, 10]], [[418, 4], [66, 8], [66, 10]]]) { - triumph - entity_count - dictionary_count - group_count - not_enough_count - text_count - result - } + triumph } } """ # Administrator / perspective author / editing permission check. - error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform cognate analysis.') + 'can perform swadesh analysis.') client_id = info.context.request.authenticated_userid @@ -13261,24 +13197,6 @@ def mutate(self, info, **args): group_field_id = args['group_field_id'] perspective_info_list = args['perspective_info_list'] - multi_list = args.get('multi_list') - - mode = args.get('mode') - - distance_flag = args.get('distance_flag') - reference_perspective_id = args.get('reference_perspective_id') - - figure_flag = args.get('figure_flag') - distance_vowel_flag = args.get('distance_vowel_flag') - distance_consonant_flag = args.get('distance_consonant_flag') - - match_translations_value = args.get('match_translations_value', 1) - only_orphans_flag = args.get('only_orphans_flag', True) - - __debug_flag__ = args.get('debug_flag', False) - __intermediate_flag__ = args.get('intermediate_flag', False) - - synchronous = args.get('synchronous', False) language_str = ( '{0}/{1}, language {2}/{3}'.format( @@ -13299,102 +13217,6 @@ def mutate(self, info, **args): request = info.context.request storage = request.registry.settings['storage'] - # Getting multi-language info, if required. - - if multi_list is None: - multi_list = [] - - multi_name_list = [] - - for language_id, perspective_count in multi_list: - - language = DBSession.query(dbLanguage).filter_by( - client_id = language_id[0], object_id = language_id[1]).first() - - multi_name_list.append( - language.get_translation(locale_id)) - - # Language tag. 
- - if mode == 'multi': - - multi_str = ', '.join( - '{0}/{1}'.format(*id) - for id, count in multi_list) - - language_str = ( - '{0}/{1}, languages {2}'.format( - source_perspective_id[0], source_perspective_id[1], - multi_str)) - - # Showing cognate analysis info, checking cognate analysis library presence. - - log.debug( - '\ncognate_analysis {}:' - '\n base language: {}' - '\n group field: {}/{}' - '\n perspectives and transcription/translation fields: {}' - '\n multi_list: {}' - '\n multi_name_list: {}' - '\n mode: {}' - '\n distance_flag: {}' - '\n reference_perspective_id: {}' - '\n figure_flag: {}' - '\n distance_vowel_flag: {}' - '\n distance_consonant_flag: {}' - '\n match_translations_value: {}' - '\n only_orphans_flag: {} ({})' - '\n __debug_flag__: {}' - '\n __intermediate_flag__: {}' - '\n cognate_analysis_f: {}' - '\n cognate_acoustic_analysis_f: {}' - '\n cognate_distance_analysis_f: {}' - '\n cognate_reconstruction_f: {}' - '\n cognate_reconstruction_multi_f: {}' - '\n cognate_suggestions_f: {}'.format( - language_str, - repr(base_language_name.strip()), - group_field_id[0], group_field_id[1], - perspective_info_list, - multi_list, - multi_name_list, - repr(mode), - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, int(only_orphans_flag), - __debug_flag__, - __intermediate_flag__, - repr(cognate_analysis_f), - repr(cognate_acoustic_analysis_f), - repr(cognate_distance_analysis_f), - repr(cognate_reconstruction_f), - repr(cognate_reconstruction_multi_f), - repr(cognate_suggestions_f))) - - # Checking if we have analysis function ready. - - analysis_f = ( - cognate_acoustic_analysis_f if mode == 'acoustic' else - cognate_reconstruction_f if mode == 'reconstruction' else - cognate_reconstruction_multi_f if mode == 'multi' else - cognate_suggestions_f if mode == 'suggestions' else - cognate_analysis_f) - - if analysis_f is None and False: - - return ResponseError(message = - 'Analysis library fuction \'{0}()\' is absent, ' - 'please contact system administrator.'.format( - 'CognateAcousticAnalysis_GetAllOutput' if mode == 'acoustic' else - 'CognateReconstruct_GetAllOutput' if mode == 'reconstruction' else - 'CognateMultiReconstruct_GetAllOutput' if mode == 'multi' else - 'GuessCognates_GetAllOutput' if mode == 'suggestions' else - 'CognateAnalysis_GetAllOutput')) - # Transforming client/object pair ids from lists to 2-tuples. source_perspective_id = tuple(source_perspective_id) @@ -13411,146 +13233,20 @@ def mutate(self, info, **args): transcription_field_id, translation_field_id in perspective_info_list] - multi_list = [ - [tuple(language_id), perspective_count] - for language_id, perspective_count in multi_list] - - if reference_perspective_id is not None: - reference_perspective_id = tuple(reference_perspective_id) - - # If we are to use acoustic data, we will launch cognate analysis in asynchronous mode. - - if mode == 'acoustic': - - client_id = request.authenticated_userid - - user_id = ( - Client.get_user_by_client_id(client_id).id - if client_id else anonymous_userid(request)) - - task_status = TaskStatus( - user_id, 'Cognate acoustic analysis', base_language_name, 5) - - # Launching cognate acoustic analysis asynchronously. 
- - request.response.status = HTTPOk.code - - if synchronous: - breakpoint() - CognateAnalysis.perform_cognate_analysis( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - None, - None, - None, - None, - None, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status, - __debug_flag__, - __intermediate_flag__) - - else: - - async_cognate_analysis.delay( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status.key, - request.registry.settings['cache_kwargs'], - request.registry.settings['sqlalchemy.url'], - __debug_flag__, - __intermediate_flag__) - - # Signifying that we've successfully launched asynchronous cognate acoustic analysis. - - return CognateAnalysis(triumph = True) - - elif mode == 'swadesh': - - return CognateAnalysis.swadesh_statistics( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - - # We do not use acoustic data, so we perform cognate analysis synchronously. - else: - - return CognateAnalysis.perform_cognate_analysis( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - - # Exception occured while we tried to perform cognate analysis. + return SwadeshAnalysis.swadesh_statistics( + language_str, + group_field_id, + perspective_info_list, + locale_id) + # Exception occured while we tried to perform swadesh analysis. except Exception as exception: traceback_string = ''.join(traceback.format_exception( exception, exception, exception.__traceback__))[:-1] log.warning( - 'cognate_analysis {0}: exception'.format( + 'swadesh_analysis {0}: exception'.format( language_str)) log.warning(traceback_string) @@ -19392,6 +19088,7 @@ class MyMutations(graphene.ObjectType): starling_etymology = StarlingEtymology.Field() phonemic_analysis = PhonemicAnalysis.Field() cognate_analysis = CognateAnalysis.Field() + swadesh_analysis = SwadeshAnalysis.Field() phonology = Phonology.Field() phonological_statistical_distance = PhonologicalStatisticalDistance.Field() sound_and_markup = SoundAndMarkup.Field() From 072bd39c2c63cdf3901f3a0605e166df6a6a5228 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 17 May 2023 19:35:44 +0300 Subject: [PATCH 11/69] Fixes and cleanup --- lingvodoc/schema/query.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 393204a1..236ee835 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,7 +364,7 @@ # Setting up logging. 
log = logging.getLogger(__name__) -logging.disable(level=logging.INFO) +#logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -11035,7 +11035,6 @@ def perform_cognate_analysis( sg_both_count = 0 source_perspective_index = None - breakpoint() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -12876,7 +12875,6 @@ def mutate(self, info, **args): request.response.status = HTTPOk.code if synchronous: - breakpoint() CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, From 3fca48b09d5e6f5d90c0f299f3b3e0e31d79cd98 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:24:57 +0300 Subject: [PATCH 12/69] Calculate commons_total as intersection Swadesh' entries --- lingvodoc/schema/query.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 236ee835..e1d4f21a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -364,7 +364,7 @@ # Setting up logging. log = logging.getLogger(__name__) -#logging.disable(level=logging.INFO) +logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -13041,8 +13041,10 @@ def split_lex(lex): perspective_info_list, group_field_id)) # Getting text data for each perspective. - # entries_map gathers words from Swadesh' list met in perspectives - entries_map = {} + # entries_set gathers entry_id(s) of words met in Swadesh' list + # swadesh_set gathers numbers of words within Swadesh' list + entries_set = {} + swadesh_set = {} for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -13094,7 +13096,8 @@ def split_lex(lex): .all()) # Grouping translations by lexical entries. 
- entries_map[perspective_id] = set() + entries_set[perspective_id] = set() + swadesh_set[perspective_id] = set() for row_index, row in enumerate(translation_query): entry_id = tuple(row[:2]) translation_list = row[2] @@ -13105,32 +13108,36 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) - for swadesh_lex in swadesh_list: + for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - entries_map[perspective_id].add(entry_id) - #print(entry_id, translation_lex) + # Store entry_id and number of the lex within Swadesh' list + entries_set[perspective_id].add(entry_id) + swadesh_set[perspective_id].add(swadesh_num) + #print(entry_id, swadesh_num, translation_lex) # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met links = {} - for perspective, entries in entries_map.items(): + for perspective, entries in entries_set.items(): links[perspective] = set() - for index_group, group in enumerate(group_list): + for group_index, group in enumerate(group_list): if (entries & group): - links[perspective].add(index_group) + links[perspective].add(group_index) # Calculate intersection between lists of group numbers for all the perspectives # So length of this intersection is the similarity of corresponding perspectives similarity = {} - for perspective1, groups1 in links.items(): + for n1, (perspective1, groups1) in enumerate(links.items()): similarity[perspective1] = {} print(perspective1, end=' :: ') - for perspective2, groups2 in links.items(): - commons = len(groups1 & groups2) - similarity[perspective1][perspective2] = commons - print(f"{perspective2}:{commons}", end=' | ') + for n2, (perspective2, groups2) in enumerate(links.items()): + if n2 <= n1: continue #exclude duplicates and self-to-self + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + commons_linked = len(groups1 & groups2) + similarity[perspective1][perspective2] = commons_total, commons_linked + print(f"{perspective2}:{commons_linked}/{commons_total}", end=' | ') print() @staticmethod From 15bb2f0723328b033d679fe6c0ebabd2c2486e4a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:43:53 +0300 Subject: [PATCH 13/69] divergence_time --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e1d4f21a..755c5d29 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13126,8 +13126,9 @@ def split_lex(lex): if (entries & group): links[perspective].add(group_index) - # Calculate intersection between lists of group numbers for all the perspectives + # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives + # commons_total means amount of Swadesh's lexems met in the both perspectives similarity = {} for n1, (perspective1, groups1) in enumerate(links.items()): similarity[perspective1] = {} @@ -13136,8 +13137,9 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) + divergence_time = -10 * math.log(commons_linked / commons_total) similarity[perspective1][perspective2] = commons_total, commons_linked - 
print(f"{perspective2}:{commons_linked}/{commons_total}", end=' | ') + print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() @staticmethod From 47a473dc153a975f982c3eb8a00188d86a629b69 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:47:37 +0300 Subject: [PATCH 14/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 755c5d29..d4659e4e 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13137,7 +13137,7 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - divergence_time = -10 * math.log(commons_linked / commons_total) + divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_total > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() From 9272fa9c60c9a6d251df9351b6b883c788fe0702 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 14:58:56 +0300 Subject: [PATCH 15/69] Math fix --- lingvodoc/schema/query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d4659e4e..de296af8 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13137,7 +13137,8 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_total > 0 else -1) + # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. + divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') print() From ab40a1c1b50933067bed45f9978f3648f5a76539 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 18:30:26 +0300 Subject: [PATCH 16/69] Exclude borrowings --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index de296af8..e7c98c21 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13032,7 +13032,9 @@ def split_lex(lex): # various forms of lexem and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') - if form.strip() and (')' not in form)) #exclude notes + if form.strip() + and (')' not in form) + and (' заим.' not in form)) #exclude notes and borrowings # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) @@ -13140,7 +13142,7 @@ def split_lex(lex): # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. 
divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) similarity[perspective1][perspective2] = commons_total, commons_linked - print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time}", end=' | ') + print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') print() @staticmethod From 16f8f0aad2eaad9fb1d32ad926f8be82621725c8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 18:59:05 +0300 Subject: [PATCH 17/69] Cleanup --- lingvodoc/schema/query.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e7c98c21..a6dc4421 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,8 +13033,8 @@ def split_lex(lex): return set(form.strip().lower() for form in lex.replace('(', ',').split(',') if form.strip() - and (')' not in form) - and (' заим.' not in form)) #exclude notes and borrowings + and ')' not in form + and ' заим.' not in form) #exclude notes and borrowings # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) @@ -13139,8 +13139,8 @@ def split_lex(lex): if n2 <= n1: continue #exclude duplicates and self-to-self commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) - # If commons_linked > 0 then commons_total > 0 all the more. If not then this is a bug. - divergence_time = (-10 * math.log(commons_linked / commons_total) if commons_linked > 0 else -1) + # commons_linked > 0 means that commons_total > 0 even more so + divergence_time = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 similarity[perspective1][perspective2] = commons_total, commons_linked print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') print() From 64e70d610d17ca0c0bb37c935b783f2b4934a333 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 19:23:56 +0300 Subject: [PATCH 18/69] Minor --- lingvodoc/schema/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a6dc4421..5805964d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13140,9 +13140,9 @@ def split_lex(lex): commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) commons_linked = len(groups1 & groups2) # commons_linked > 0 means that commons_total > 0 even more so - divergence_time = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 - similarity[perspective1][perspective2] = commons_total, commons_linked - print(f"{perspective2}:{commons_linked}/{commons_total}:{divergence_time:.2f}", end=' | ') + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 + similarity[perspective1][perspective2] = commons_linked, commons_total + print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() @staticmethod From d530914fb30cfa26a72af210d43bc8d0ae0f7705 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 20:49:51 +0300 Subject: [PATCH 19/69] distance_graph function --- lingvodoc/schema/query.py | 782 +++++++++++++++++++------------------- 1 file changed, 400 insertions(+), 382 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 5805964d..9a77feaa 100644 --- a/lingvodoc/schema/query.py 
+++ b/lingvodoc/schema/query.py @@ -10887,6 +10887,399 @@ def f_callback(xyz): return result_x, f(result.x) + @staticmethod + def distance_graph( + language_str, + distance_data_array, + distance_header_array, + __debug_flag__): + + d_ij = (distance_data_array + distance_data_array.T) / 2 + + log.debug( + '\ncognate_analysis {0}:' + '\ndistance_header_array:\n{1}' + '\ndistance_data_array:\n{2}' + '\nd_ij:\n{3}'.format( + language_str, + distance_header_array, + distance_data_array, + d_ij)) + + # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to + # orient it left-right. + + if len(distance_data_array) > 1: + + embedding_2d, strain_2d = ( + CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) + + embedding_2d_pca = ( + sklearn.decomposition.PCA(n_components = 2) + .fit_transform(embedding_2d)) + + distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) + + else: + + embedding_2d = numpy.zeros((1, 2)) + embedding_2d_pca = numpy.zeros((1, 2)) + + strain_2d = 0.0 + + distance_2d = numpy.zeros((1, 1)) + + # Showing what we computed. + + log.debug( + '\ncognate_analysis {0}:' + '\nembedding 2d:\n{1}' + '\nembedding 2d (PCA-oriented):\n{2}' + '\nstrain 2d:\n{3}' + '\ndistances 2d:\n{4}'.format( + language_str, + embedding_2d, + embedding_2d_pca, + strain_2d, + distance_2d)) + + # And now the same with 3d embedding. + + if len(distance_data_array) > 1: + + embedding_3d, strain_3d = ( + CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) + + # At least three points, standard PCA-based orientation. + + if len(distance_data_array) >= 3: + + embedding_3d_pca = ( + sklearn.decomposition.PCA(n_components = 3) + .fit_transform(embedding_3d)) + + # Only two points, so we take 2d embedding and extend it with zeros. + + else: + + embedding_3d_pca = ( + + numpy.hstack(( + embedding_2d_pca, + numpy.zeros((embedding_2d_pca.shape[0], 1))))) + + # Making 3d embedding actually 3d, if required. + + if embedding_3d_pca.shape[1] <= 2: + + embedding_3d_pca = ( + + numpy.hstack(( + embedding_3d_pca, + numpy.zeros((embedding_3d_pca.shape[0], 1))))) + + distance_3d = ( + sklearn.metrics.euclidean_distances(embedding_3d_pca)) + + else: + + embedding_3d = numpy.zeros((1, 3)) + embedding_3d_pca = numpy.zeros((1, 3)) + + strain_3d = 0.0 + + distance_3d = numpy.zeros((1, 1)) + + # Showing what we've get. + + log.debug( + '\ncognate_analysis {0}:' + '\nembedding 3d:\n{1}' + '\nembedding 3d (PCA-oriented):\n{2}' + '\nstrain 3d:\n{3}' + '\ndistances 3d:\n{4}'.format( + language_str, + embedding_3d, + embedding_3d_pca, + strain_3d, + distance_3d)) + + # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d + # embedding distances to break ties. + + if len(distance_data_array) <= 1: + mst_list = [] + + else: + + d_min, d_extra_min, min_i, min_j = min( + (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j) + for i in range(d_ij.shape[0] - 1) + for j in range(i + 1, d_ij.shape[0])) + + mst_list = [(min_i, min_j)] + mst_dict = {} + + # MST construction initialization. + + for i in range(d_ij.shape[0]): + + if i == min_i or i == min_j: + continue + + d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i]) + d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j]) + + mst_dict[i] = ( + (d_min_i, min_i) if d_min_i <= d_min_j else + (d_min_j, min_i)) + + # Iterative MST construction. 
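
# For illustration, a minimal standalone sketch (all distances made up) of the
# Jarnik-Prim construction that the loop below completes; the patch code above
# additionally seeds the tree with the globally smallest edge, while this
# sketch simply starts from vertex 0.  Candidate edges are compared as
# (d_ij, d_2d + d_3d) tuples, so Python's lexicographic tuple ordering is what
# breaks ties on the raw cognate distance.

import numpy

def prim_mst(d, d_extra):
    # d and d_extra are symmetric (n, n) distance matrices.
    n = d.shape[0]
    in_tree = {0}
    edge_list = []
    while len(in_tree) < n:
        _, _, i_from, i_to = min(
            (d[i, j], d_extra[i, j], i, j)
            for i in in_tree
            for j in range(n) if j not in in_tree)
        edge_list.append((i_from, i_to))
        in_tree.add(i_to)
    return edge_list

# Vertices 1 and 2 are equally far from vertex 0, so the extra distance
# decides which edge enters the tree first.
d = numpy.array([[0.0, 2.0, 2.0], [2.0, 0.0, 5.0], [2.0, 5.0, 0.0]])
d_extra = numpy.array([[0.0, 1.0, 3.0], [1.0, 0.0, 4.0], [3.0, 4.0, 0.0]])
print(prim_mst(d, d_extra))   # [(0, 1), (0, 2)]
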
+ + while len(mst_dict) > 0: + + (d_min, d_extra_min, i_min, i_from_min) = min( + (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) + + log.debug('\n' + pprint.pformat(mst_dict)) + log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) + + mst_list.append((i_from_min, i_min)) + del mst_dict[i_min] + + # Updating shortest connection info. + + for i_to in mst_dict.keys(): + + d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) + + if d_to < mst_dict[i_to][0]: + mst_dict[i_to] = (d_to, i_min) + + log.debug( + '\ncognate_analysis {0}:' + '\nminimum spanning tree:\n{1}'.format( + language_str, + pprint.pformat(mst_list))) + + # Plotting with matplotlib. + + figure = pyplot.figure(figsize = (10, 10)) + axes = figure.add_subplot(212) + + axes.set_title( + 'Etymological distance tree (relative distance embedding)', + fontsize = 14, family = 'Gentium') + + axes.axis('equal') + axes.axis('off') + axes.autoscale() + + def f(axes, embedding_pca): + """ + Plots specified graph embedding on a given axis. + """ + + flag_3d = numpy.size(embedding_pca, 1) > 2 + + for index, (position, name) in enumerate( + zip(embedding_pca, distance_header_array)): + + # Checking if any of the previous perspectives are already in this perspective's + # position. + + same_position_index = None + + for i, p in enumerate(embedding_pca[:index]): + if numpy.linalg.norm(position - p) <= 1e-3: + + same_position_index = i + break + + color = matplotlib.colors.hsv_to_rgb( + [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) + + label_same_str = ( + '' if same_position_index is None else + ' (same as {0})'.format(same_position_index + 1)) + + kwargs = { + 's': 35, + 'color': color, + 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} + + axes.scatter(*position, **kwargs) + + # Annotating position with its number, but only if we hadn't already annotated nearby. + + if same_position_index is None: + + if flag_3d: + + axes.text( + position[0] + 0.01, position[1], position[2] + 0.01, + str(index + 1), None, fontsize = 14) + + else: + + axes.annotate( + str(index + 1), + (position[0] + 0.01, position[1] - 0.005), + fontsize = 14) + + # Plotting minimum spanning trees. + + line_list = [ + (embedding_pca[i], embedding_pca[j]) + for i, j in mst_list] + + line_collection = ( + Line3DCollection if flag_3d else LineCollection)( + line_list, zorder = 0, color = 'gray') + + axes.add_collection(line_collection) + + pyplot.setp(axes.texts, family = 'Gentium') + + # Plotting our embedding, creating the legend. + + f(axes, embedding_2d_pca) + + pyplot.tight_layout() + + legend = axes.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) + + pyplot.setp(legend.texts, family = 'Gentium') + axes.autoscale_view() + + # Saving generated figure for debug purposes, if required. + + if __debug_flag__: + + figure_file_name = ( + 'figure cognate distance{0}.png'.format( + mode_name_str)) + + with open(figure_file_name, 'wb') as figure_file: + + pyplot.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Also generating 3d embedding figure. 
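
# Note on the extraction above: the debug branch just above formats its file
# name with mode_name_str, and the figure-saving code further below uses
# base_language_name, mode, current_datetime, storage_dir, cur_time and
# storage.  None of these are parameters of distance_graph() at this point in
# the series (later patches thread several of them through), so unless they
# happen to be defined at an enclosing scope the call would fail in the same
# way as this toy case:

def extracted():
    # mode_name_str is looked up as a global at call time.
    return 'figure cognate distance{0}.png'.format(mode_name_str)

try:
    extracted()
except NameError as error:
    print(error)   # name 'mode_name_str' is not defined
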
+ + figure_3d = pyplot.figure() + figure_3d.set_size_inches(16, 10) + + axes_3d = figure_3d.add_subplot(111, projection = '3d') + + axes_3d.axis('equal') + axes_3d.view_init(elev = 30, azim = -75) + + f(axes_3d, embedding_3d_pca) + + # Setting up legend. + + axes_3d.set_xlabel('X') + axes_3d.set_ylabel('Y') + axes_3d.set_zlabel('Z') + + legend_3d = axes_3d.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) + + pyplot.setp(legend_3d.texts, family = 'Gentium') + + # Fake cubic bounding box to force axis aspect ratios, see + # https://stackoverflow.com/a/13701747/2016856. + + X = embedding_3d_pca[:,0] + Y = embedding_3d_pca[:,1] + Z = embedding_3d_pca[:,2] + + max_range = numpy.array([ + X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() + + Xb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + + 0.5 * (X.max() + X.min())) + + Yb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + + 0.5 * (Y.max() + Y.min())) + + Zb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + + 0.5 * (Z.max() + Z.min())) + + for xb, yb, zb in zip(Xb, Yb, Zb): + axes_3d.plot([xb], [yb], [zb], 'w') + + axes_3d.autoscale_view() + + # And saving it. + + figure_3d_file_name = ( + 'figure 3d cognate distance{0}.png'.format( + mode_name_str)) + + with open(figure_3d_file_name, 'wb') as figure_3d_file: + + figure_3d.savefig( + figure_3d_file, + bbox_extra_artists = (legend_3d,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Storing generated figure as a PNG image. + + figure_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + figure_path = os.path.join(storage_dir, figure_filename) + os.makedirs(os.path.dirname(figure_path), exist_ok = True) + + with open(figure_path, 'wb') as figure_file: + + figure.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + figure_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', figure_filename]) + + return ( + figure_url, + mst_list, + embedding_2d_pca, + embedding_3d_pca + ) + @staticmethod def perform_cognate_analysis( language_str, @@ -12196,396 +12589,21 @@ def perform_cognate_analysis( distance_list)) # Generating distance graph, if required. - figure_url = None - mst_list = None embedding_2d_pca = None embedding_3d_pca = None if figure_flag: - - d_ij = (distance_data_array + distance_data_array.T) / 2 - - log.debug( - '\ncognate_analysis {0}:' - '\ndistance_header_array:\n{1}' - '\ndistance_data_array:\n{2}' - '\nd_ij:\n{3}'.format( - language_str, - distance_header_array, - distance_data_array, - d_ij)) - - # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to - # orient it left-right. 
- - if len(distance_data_array) > 1: - - embedding_2d, strain_2d = ( - CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) - - embedding_2d_pca = ( - sklearn.decomposition.PCA(n_components = 2) - .fit_transform(embedding_2d)) - - distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) - - else: - - embedding_2d = numpy.zeros((1, 2)) - embedding_2d_pca = numpy.zeros((1, 2)) - - strain_2d = 0.0 - - distance_2d = numpy.zeros((1, 1)) - - # Showing what we computed. - - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 2d:\n{1}' - '\nembedding 2d (PCA-oriented):\n{2}' - '\nstrain 2d:\n{3}' - '\ndistances 2d:\n{4}'.format( - language_str, - embedding_2d, - embedding_2d_pca, - strain_2d, - distance_2d)) - - # And now the same with 3d embedding. - - if len(distance_data_array) > 1: - - embedding_3d, strain_3d = ( - CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) - - # At least three points, standard PCA-based orientation. - - if len(distance_data_array) >= 3: - - embedding_3d_pca = ( - sklearn.decomposition.PCA(n_components = 3) - .fit_transform(embedding_3d)) - - # Only two points, so we take 2d embedding and extend it with zeros. - - else: - - embedding_3d_pca = ( - - numpy.hstack(( - embedding_2d_pca, - numpy.zeros((embedding_2d_pca.shape[0], 1))))) - - # Making 3d embedding actually 3d, if required. - - if embedding_3d_pca.shape[1] <= 2: - - embedding_3d_pca = ( - - numpy.hstack(( - embedding_3d_pca, - numpy.zeros((embedding_3d_pca.shape[0], 1))))) - - distance_3d = ( - sklearn.metrics.euclidean_distances(embedding_3d_pca)) - - else: - - embedding_3d = numpy.zeros((1, 3)) - embedding_3d_pca = numpy.zeros((1, 3)) - - strain_3d = 0.0 - - distance_3d = numpy.zeros((1, 1)) - - # Showing what we've get. - - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 3d:\n{1}' - '\nembedding 3d (PCA-oriented):\n{2}' - '\nstrain 3d:\n{3}' - '\ndistances 3d:\n{4}'.format( - language_str, - embedding_3d, - embedding_3d_pca, - strain_3d, - distance_3d)) - - # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d - # embedding distances to break ties. - - if len(distance_data_array) <= 1: - mst_list = [] - - else: - - d_min, d_extra_min, min_i, min_j = min( - (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j) - for i in range(d_ij.shape[0] - 1) - for j in range(i + 1, d_ij.shape[0])) - - mst_list = [(min_i, min_j)] - mst_dict = {} - - # MST construction initialization. - - for i in range(d_ij.shape[0]): - - if i == min_i or i == min_j: - continue - - d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i]) - d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j]) - - mst_dict[i] = ( - (d_min_i, min_i) if d_min_i <= d_min_j else - (d_min_j, min_i)) - - # Iterative MST construction. - - while len(mst_dict) > 0: - - (d_min, d_extra_min, i_min, i_from_min) = min( - (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) - - log.debug('\n' + pprint.pformat(mst_dict)) - log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) - - mst_list.append((i_from_min, i_min)) - del mst_dict[i_min] - - # Updating shortest connection info. - - for i_to in mst_dict.keys(): - - d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) - - if d_to < mst_dict[i_to][0]: - mst_dict[i_to] = (d_to, i_min) - - log.debug( - '\ncognate_analysis {0}:' - '\nminimum spanning tree:\n{1}'.format( - language_str, - pprint.pformat(mst_list))) - - # Plotting with matplotlib. 
- - figure = pyplot.figure(figsize = (10, 10)) - axes = figure.add_subplot(212) - - axes.set_title( - 'Etymological distance tree (relative distance embedding)', - fontsize = 14, family = 'Gentium') - - axes.axis('equal') - axes.axis('off') - axes.autoscale() - - def f(axes, embedding_pca): - """ - Plots specified graph embedding on a given axis. - """ - - flag_3d = numpy.size(embedding_pca, 1) > 2 - - for index, (position, name) in enumerate( - zip(embedding_pca, distance_header_array)): - - # Checking if any of the previous perspectives are already in this perspective's - # position. - - same_position_index = None - - for i, p in enumerate(embedding_pca[:index]): - if numpy.linalg.norm(position - p) <= 1e-3: - - same_position_index = i - break - - color = matplotlib.colors.hsv_to_rgb( - [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) - - label_same_str = ( - '' if same_position_index is None else - ' (same as {0})'.format(same_position_index + 1)) - - kwargs = { - 's': 35, - 'color': color, - 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} - - axes.scatter(*position, **kwargs) - - # Annotating position with its number, but only if we hadn't already annotated nearby. - - if same_position_index is None: - - if flag_3d: - - axes.text( - position[0] + 0.01, position[1], position[2] + 0.01, - str(index + 1), None, fontsize = 14) - - else: - - axes.annotate( - str(index + 1), - (position[0] + 0.01, position[1] - 0.005), - fontsize = 14) - - # Plotting minimum spanning trees. - - line_list = [ - (embedding_pca[i], embedding_pca[j]) - for i, j in mst_list] - - line_collection = ( - Line3DCollection if flag_3d else LineCollection)( - line_list, zorder = 0, color = 'gray') - - axes.add_collection(line_collection) - - pyplot.setp(axes.texts, family = 'Gentium') - - # Plotting our embedding, creating the legend. - - f(axes, embedding_2d_pca) - - pyplot.tight_layout() - - legend = axes.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) - - pyplot.setp(legend.texts, family = 'Gentium') - axes.autoscale_view() - - # Saving generated figure for debug purposes, if required. - - if __debug_flag__: - - figure_file_name = ( - 'figure cognate distance{0}.png'.format( - mode_name_str)) - - with open(figure_file_name, 'wb') as figure_file: - - pyplot.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - # Also generating 3d embedding figure. - - figure_3d = pyplot.figure() - figure_3d.set_size_inches(16, 10) - - axes_3d = figure_3d.add_subplot(111, projection = '3d') - - axes_3d.axis('equal') - axes_3d.view_init(elev = 30, azim = -75) - - f(axes_3d, embedding_3d_pca) - - # Setting up legend. - - axes_3d.set_xlabel('X') - axes_3d.set_ylabel('Y') - axes_3d.set_zlabel('Z') - - legend_3d = axes_3d.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) - - pyplot.setp(legend_3d.texts, family = 'Gentium') - - # Fake cubic bounding box to force axis aspect ratios, see - # https://stackoverflow.com/a/13701747/2016856. 
- - X = embedding_3d_pca[:,0] - Y = embedding_3d_pca[:,1] - Z = embedding_3d_pca[:,2] - - max_range = numpy.array([ - X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() - - Xb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + - 0.5 * (X.max() + X.min())) - - Yb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + - 0.5 * (Y.max() + Y.min())) - - Zb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + - 0.5 * (Z.max() + Z.min())) - - for xb, yb, zb in zip(Xb, Yb, Zb): - axes_3d.plot([xb], [yb], [zb], 'w') - - axes_3d.autoscale_view() - - # And saving it. - - figure_3d_file_name = ( - 'figure 3d cognate distance{0}.png'.format( - mode_name_str)) - - with open(figure_3d_file_name, 'wb') as figure_3d_file: - - figure_3d.savefig( - figure_3d_file, - bbox_extra_artists = (legend_3d,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - # Storing generated figure as a PNG image. - - figure_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) - - figure_path = os.path.join(storage_dir, figure_filename) - os.makedirs(os.path.dirname(figure_path), exist_ok = True) - - with open(figure_path, 'wb') as figure_file: - - figure.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - figure_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', figure_filename]) + figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ + distance_graph( + language_str, + distance_data_array, + distance_header_array, + __debug_flag__ + ) # Finalizing task status, if required, returning result. 
- if task_status is not None: result_link_list = ( From b02f0da4c6867b471fe5461623cdfbb689e1b8e2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 18 May 2023 20:59:11 +0300 Subject: [PATCH 20/69] Switch back Cognate analysis --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 9a77feaa..af974fd2 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12841,7 +12841,7 @@ def mutate(self, info, **args): cognate_suggestions_f if mode == 'suggestions' else cognate_analysis_f) - if analysis_f is None and False: + if analysis_f is None: return ResponseError(message = 'Analysis library fuction \'{0}()\' is absent, ' From 614aee6fad8374e551162d0d13a108b68dc29c06 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 16:15:49 +0300 Subject: [PATCH 21/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index af974fd2..3ebbe069 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12596,7 +12596,7 @@ def perform_cognate_analysis( if figure_flag: figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ - distance_graph( + CognateAnalysis.distance_graph( language_str, distance_data_array, distance_header_array, From 2a9e4b4a7b15c5af1acf312fc1240da2730f679f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 18:30:59 +0300 Subject: [PATCH 22/69] Fix --- lingvodoc/schema/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3ebbe069..2b4e4ec0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10890,6 +10890,7 @@ def f_callback(xyz): @staticmethod def distance_graph( language_str, + base_language_name, distance_data_array, distance_header_array, __debug_flag__): @@ -12598,6 +12599,7 @@ def perform_cognate_analysis( figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, + base_language_name, distance_data_array, distance_header_array, __debug_flag__ @@ -13046,13 +13048,14 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): + if ' заим.' in lex: + return set() # Split by commas and open brackets to separate - # various forms of lexem and extra note if is + # various forms of lexeme and extra note if is return set(form.strip().lower() for form in lex.replace('(', ',').split(',') if form.strip() - and ')' not in form - and ' заим.' 
not in form) #exclude notes and borrowings + and ')' not in form) # exclude notes # return true if the intersection is not empty return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex)) From 8208eef7fd0507b8fae75edacf23e83b1a24eb39 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 19:36:39 +0300 Subject: [PATCH 23/69] distance_data_array --- lingvodoc/schema/query.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 2b4e4ec0..f3c22b56 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10893,7 +10893,7 @@ def distance_graph( base_language_name, distance_data_array, distance_header_array, - __debug_flag__): + __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -13031,6 +13031,7 @@ class Arguments: @staticmethod def swadesh_statistics( language_str, + base_language_name, group_field_id, perspective_info_list, locale_id): @@ -13068,10 +13069,12 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} + distance_array_size = len(perspective_info_list) + distance_data_array = numpy.full((distance_array_size, distance_array_size), 100) + distance_header_array = numpy.empty(distance_array_size) for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): - ''' # Getting and saving perspective info. perspective = ( DBSession @@ -13081,18 +13084,7 @@ def split_lex(lex): ) perspective_name = perspective.get_translation(locale_id) - dictionary_name = perspective.parent.get_translation(locale_id) - - log.debug( - '\nswadesh_analysis {0}:' - '\n dictionary {1}/{2}: {3}' - '\n perspective {4}/{5}: {6}'.format( - language_str, - perspective.parent_client_id, perspective.parent_object_id, - repr(dictionary_name.strip()), - perspective_id[0], perspective_id[1], - repr(perspective_name.strip()))) - ''' + distance_header_array[index] = perspective_name # Getting text data. 
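
# A rough plain-Python sketch (hypothetical rows) of what the query below
# computes: for every lexical entry of the perspective, aggregate the contents
# of its published and accepted translation entities, i.e. the
# array_agg(...) ... GROUP BY lexical entry part of the SQLAlchemy query.

import itertools

entity_rows = [
    # (lexical entry id, content, published, accepted)
    ((101, 1), 'рыба', True, True),
    ((101, 1), 'fish', True, True),
    ((102, 1), 'птица', True, False),   # not accepted, so it is filtered out
]

accepted_rows = sorted(
    (row for row in entity_rows if row[2] and row[3]),
    key = lambda row: row[0])

translation_dict = {
    entry_id: [content for _, content, _, _ in rows]
    for entry_id, rows in itertools.groupby(
        accepted_rows, key = lambda row: row[0])}

print(translation_dict)   # {(101, 1): ['рыба', 'fish']}
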
translation_query = ( @@ -13157,15 +13149,23 @@ def split_lex(lex): similarity[perspective1] = {} print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): - if n2 <= n1: continue #exclude duplicates and self-to-self - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + #if n2 <= n1: continue #exclude duplicates and self-to-self commons_linked = len(groups1 & groups2) + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else -1 - similarity[perspective1][perspective2] = commons_linked, commons_total + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + distance_data_array[n1][n2] = distance + #similarity[perspective1][perspective2] = commons_linked, commons_total print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array + ) + @staticmethod def mutate(self, info, **args): """ @@ -13266,6 +13266,7 @@ def mutate(self, info, **args): return SwadeshAnalysis.swadesh_statistics( language_str, + base_language_name, group_field_id, perspective_info_list, locale_id) From f57492e1f1767e5c5deab7f44ee6f1897f74c44f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 19 May 2023 23:31:52 +0300 Subject: [PATCH 24/69] Result dict --- lingvodoc/schema/query.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f3c22b56..632b9b21 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10893,6 +10893,7 @@ def distance_graph( base_language_name, distance_data_array, distance_header_array, + mode, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12602,6 +12603,7 @@ def perform_cognate_analysis( base_language_name, distance_data_array, distance_header_array, + mode, __debug_flag__ ) @@ -13028,6 +13030,10 @@ class Arguments: triumph = graphene.Boolean() + minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) + embedding_2d = graphene.List(graphene.List(graphene.Float)) + embedding_3d = graphene.List(graphene.List(graphene.Float)) + @staticmethod def swadesh_statistics( language_str, @@ -13163,9 +13169,28 @@ def split_lex(lex): language_str, base_language_name, distance_data_array, - distance_header_array + distance_header_array, + "swadesh" ) + result_dict = ( + + dict( + triumph = True, + + minimum_spanning_tree = mst_list, + embedding_2d = embedding_2d_pca, + embedding_3d = embedding_3d_pca)) + + if __debug_flag__ and __result_flag__: + + with gzip.open( + result_file_name, 'wb') as result_file: + + pickle.dump(result_dict, result_file) + + return SwadeshAnalysis(**result_dict) + @staticmethod def mutate(self, info, **args): """ From 2a1d07214002803c136ec1c8f56a1a347f99b128 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 21 May 2023 23:13:30 +0300 Subject: [PATCH 25/69] Tuned distance_graph --- lingvodoc/schema/query.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 632b9b21..03a6ba43 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10894,6 +10894,9 @@ def distance_graph( 
distance_data_array, distance_header_array, mode, + storage, + storage_dir, + figure_filename, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12604,6 +12607,9 @@ def perform_cognate_analysis( distance_data_array, distance_header_array, mode, + storage, + storage_dir, + figure_filename, __debug_flag__ ) @@ -13165,13 +13171,17 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() - CognateAnalysis.distance_graph( - language_str, - base_language_name, - distance_data_array, - distance_header_array, - "swadesh" - ) + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array, + "swadesh", + storage, + storage_dir, + figure_filename + ) result_dict = ( From 76ab301e5ad241675c096835a335430f1378ce4e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 21 May 2023 23:34:07 +0300 Subject: [PATCH 26/69] Cleanup --- lingvodoc/schema/query.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 03a6ba43..d940aa36 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -12957,31 +12957,6 @@ def mutate(self, info, **args): return CognateAnalysis(triumph = True) - elif mode == 'swadesh': - - return CognateAnalysis.swadesh_statistics( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - None, - __debug_flag__, - __intermediate_flag__) - # We do not use acoustic data, so we perform cognate analysis synchronously. 
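
# For illustration, the pairwise distance that swadesh_statistics() (in the
# patches above) assigns to a pair of dictionaries: the share of common
# Swadesh items that are etymologically linked goes through a negative
# logarithm scaled by the 0.14 constant, identical lists give (minus) zero,
# and the `else 100` branch acts as a "no measurable relation" sentinel.
# The counts below are made up.

import math

def swadesh_distance(commons_linked, commons_total):
    return (
        math.log(commons_linked / commons_total) / -0.14
        if commons_linked > 0 else 100)

print(round(swadesh_distance(100, 100), 2))   # -0.0, i.e. zero
print(round(swadesh_distance(70, 100), 2))    # 2.55
print(round(swadesh_distance(50, 100), 2))    # 4.95
print(swadesh_distance(0, 100))               # 100
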
else: From ba1b8bd7a4cd91b0a984509dadb5fb6b636bb047 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 00:07:37 +0300 Subject: [PATCH 27/69] Storage arg --- lingvodoc/schema/query.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d940aa36..93cce66c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,7 +10896,6 @@ def distance_graph( mode, storage, storage_dir, - figure_filename, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -12609,7 +12608,6 @@ def perform_cognate_analysis( mode, storage, storage_dir, - figure_filename, __debug_flag__ ) @@ -13021,7 +13019,8 @@ def swadesh_statistics( base_language_name, group_field_id, perspective_info_list, - locale_id): + locale_id, + storage): swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', @@ -13146,6 +13145,9 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() + cur_time = time.time() + storage_dir = os.path.join(storage['path'], 'swadesh', str(cur_time)) + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, @@ -13154,8 +13156,7 @@ def split_lex(lex): distance_header_array, "swadesh", storage, - storage_dir, - figure_filename + storage_dir ) result_dict = ( @@ -13279,7 +13280,8 @@ def mutate(self, info, **args): base_language_name, group_field_id, perspective_info_list, - locale_id) + locale_id, + storage) # Exception occured while we tried to perform swadesh analysis. except Exception as exception: From 4f7938a61e88e774585ab17ca651aa17d4bab503 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 00:50:36 +0300 Subject: [PATCH 28/69] current_datetime --- lingvodoc/schema/query.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 93cce66c..8c2ee4bb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -11252,7 +11252,7 @@ def f(axes, embedding_pca): format = 'png') # Storing generated figure as a PNG image. 
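
# A worked example (hypothetical language name and date) of the file name the
# code below builds; pathvalidate.sanitize_filename() then only removes
# characters that would be invalid in a file name.

import datetime
import pathvalidate

current_datetime = datetime.datetime(2023, 5, 22, tzinfo = datetime.timezone.utc)
base_language_name = 'Uralic'
mode = 'swadesh'

figure_filename = pathvalidate.sanitize_filename(
    '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format(
        base_language_name[:64],
        ' ' + mode if mode else '',
        current_datetime.year,
        current_datetime.month,
        current_datetime.day))

print(figure_filename)   # Uralic cognate swadesh analysis 2023.05.22.png
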
- + current_datetime = datetime.datetime.now(datetime.timezone.utc) figure_filename = pathvalidate.sanitize_filename( '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( base_language_name[:64], @@ -13055,9 +13055,9 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - distance_array_size = len(perspective_info_list) - distance_data_array = numpy.full((distance_array_size, distance_array_size), 100) - distance_header_array = numpy.empty(distance_array_size) + dictionary_count = len(perspective_info_list) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_header_array = numpy.empty(dictionary_count) for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): From 8a36aaf28980cb12c1979efd96d496420d7aec15 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 01:22:27 +0300 Subject: [PATCH 29/69] cur_time --- lingvodoc/schema/query.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c2ee4bb..4357a559 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -11273,6 +11273,7 @@ def f(axes, embedding_pca): pad_inches = 0.25, format = 'png') + cur_time = time.time() figure_url = ''.join([ storage['prefix'], storage['static_route'], 'cognate', '/', str(cur_time), '/', figure_filename]) From eeea3d8ffd2be15d14198ec114d3ecd66fde4b90 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 18:43:04 +0300 Subject: [PATCH 30/69] Used handleResult --- lingvodoc/schema/query.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4357a559..fde8b5cc 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13058,7 +13058,7 @@ def split_lex(lex): swadesh_set = {} dictionary_count = len(perspective_info_list) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count) + distance_header_array = numpy.empty(dictionary_count, dtype='object') for index, (perspective_id, _, translation_field_id) in \ enumerate(perspective_info_list): @@ -13159,7 +13159,6 @@ def split_lex(lex): storage, storage_dir ) - result_dict = ( dict( @@ -13169,13 +13168,6 @@ def split_lex(lex): embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca)) - if __debug_flag__ and __result_flag__: - - with gzip.open( - result_file_name, 'wb') as result_file: - - pickle.dump(result_dict, result_file) - return SwadeshAnalysis(**result_dict) @staticmethod From 56a66e5ad4cb6a15710059040f3a91d1f5f4c75d Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 22 May 2023 20:18:39 +0300 Subject: [PATCH 31/69] perspective_name_list --- lingvodoc/schema/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index fde8b5cc..f09dc798 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13013,6 +13013,7 @@ class Arguments: minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) + perspective_name_list = graphene.List(graphene.String) @staticmethod def swadesh_statistics( @@ -13166,7 +13167,8 @@ def split_lex(lex): minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, - embedding_3d = 
embedding_3d_pca)) + embedding_3d = embedding_3d_pca, + perspective_name_list = distance_header_array)) return SwadeshAnalysis(**result_dict) From 3a42ee8d582df9f67b49e656f57496323e6d117f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 23 May 2023 17:57:05 +0300 Subject: [PATCH 32/69] First graph --- lingvodoc/schema/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f09dc798..f03423c5 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13071,8 +13071,8 @@ def split_lex(lex): .first() ) - perspective_name = perspective.get_translation(locale_id) - distance_header_array[index] = perspective_name + dictionary_name = perspective.parent.get_translation(locale_id) + distance_header_array[index] = dictionary_name # Getting text data. translation_query = ( From 5b2925c1318c1cada2861847eaea5ecaafad3aae Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 12:28:35 +0300 Subject: [PATCH 33/69] __plot_flag__ --- lingvodoc/schema/query.py | 309 +++++++++++++++++++------------------- 1 file changed, 157 insertions(+), 152 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f03423c5..51f141eb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,6 +10896,7 @@ def distance_graph( mode, storage, storage_dir, + __plot_flag__ = True, __debug_flag__ = False): d_ij = (distance_data_array + distance_data_array.T) / 2 @@ -11067,216 +11068,219 @@ def distance_graph( pprint.pformat(mst_list))) # Plotting with matplotlib. + figure_url = None + if __plot_flag__: - figure = pyplot.figure(figsize = (10, 10)) - axes = figure.add_subplot(212) + figure = pyplot.figure(figsize = (10, 10)) + axes = figure.add_subplot(212) - axes.set_title( - 'Etymological distance tree (relative distance embedding)', - fontsize = 14, family = 'Gentium') + axes.set_title( + 'Etymological distance tree (relative distance embedding)', + fontsize = 14, family = 'Gentium') - axes.axis('equal') - axes.axis('off') - axes.autoscale() + axes.axis('equal') + axes.axis('off') + axes.autoscale() - def f(axes, embedding_pca): - """ - Plots specified graph embedding on a given axis. - """ + def f(axes, embedding_pca): + """ + Plots specified graph embedding on a given axis. + """ - flag_3d = numpy.size(embedding_pca, 1) > 2 + flag_3d = numpy.size(embedding_pca, 1) > 2 - for index, (position, name) in enumerate( - zip(embedding_pca, distance_header_array)): + for index, (position, name) in enumerate( + zip(embedding_pca, distance_header_array)): - # Checking if any of the previous perspectives are already in this perspective's - # position. + # Checking if any of the previous perspectives are already in this perspective's + # position. 
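
# A small aside on the colour selection a few lines below: because
# `same_position_index or index` falls back to `index` whenever
# same_position_index equals 0, a point that coincides with the very first
# perspective gets its own hue instead of reusing hue 0; an explicit None
# check sidesteps this falsy-zero pitfall.

same_position_index = 0   # hypothetical: coincides with perspective 0
index = 3

print(same_position_index or index)                                        # 3
print(same_position_index if same_position_index is not None else index)   # 0
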
- same_position_index = None + same_position_index = None - for i, p in enumerate(embedding_pca[:index]): - if numpy.linalg.norm(position - p) <= 1e-3: + for i, p in enumerate(embedding_pca[:index]): + if numpy.linalg.norm(position - p) <= 1e-3: - same_position_index = i - break + same_position_index = i + break - color = matplotlib.colors.hsv_to_rgb( - [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) + color = matplotlib.colors.hsv_to_rgb( + [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) - label_same_str = ( - '' if same_position_index is None else - ' (same as {0})'.format(same_position_index + 1)) + label_same_str = ( + '' if same_position_index is None else + ' (same as {0})'.format(same_position_index + 1)) - kwargs = { - 's': 35, - 'color': color, - 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} + kwargs = { + 's': 35, + 'color': color, + 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} - axes.scatter(*position, **kwargs) + axes.scatter(*position, **kwargs) - # Annotating position with its number, but only if we hadn't already annotated nearby. + # Annotating position with its number, but only if we hadn't already annotated nearby. - if same_position_index is None: + if same_position_index is None: - if flag_3d: + if flag_3d: - axes.text( - position[0] + 0.01, position[1], position[2] + 0.01, - str(index + 1), None, fontsize = 14) + axes.text( + position[0] + 0.01, position[1], position[2] + 0.01, + str(index + 1), None, fontsize = 14) - else: + else: - axes.annotate( - str(index + 1), - (position[0] + 0.01, position[1] - 0.005), - fontsize = 14) + axes.annotate( + str(index + 1), + (position[0] + 0.01, position[1] - 0.005), + fontsize = 14) - # Plotting minimum spanning trees. + # Plotting minimum spanning trees. - line_list = [ - (embedding_pca[i], embedding_pca[j]) - for i, j in mst_list] + line_list = [ + (embedding_pca[i], embedding_pca[j]) + for i, j in mst_list] - line_collection = ( - Line3DCollection if flag_3d else LineCollection)( - line_list, zorder = 0, color = 'gray') + line_collection = ( + Line3DCollection if flag_3d else LineCollection)( + line_list, zorder = 0, color = 'gray') - axes.add_collection(line_collection) + axes.add_collection(line_collection) - pyplot.setp(axes.texts, family = 'Gentium') + pyplot.setp(axes.texts, family = 'Gentium') - # Plotting our embedding, creating the legend. + # Plotting our embedding, creating the legend. - f(axes, embedding_2d_pca) + f(axes, embedding_2d_pca) - pyplot.tight_layout() + pyplot.tight_layout() - legend = axes.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + legend = axes.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - pyplot.setp(legend.texts, family = 'Gentium') - axes.autoscale_view() + pyplot.setp(legend.texts, family = 'Gentium') + axes.autoscale_view() - # Saving generated figure for debug purposes, if required. + # Saving generated figure for debug purposes, if required. 
- if __debug_flag__: + if __debug_flag__: - figure_file_name = ( - 'figure cognate distance{0}.png'.format( - mode_name_str)) + figure_file_name = ( + 'figure cognate distance{0}.png'.format( + mode_name_str)) - with open(figure_file_name, 'wb') as figure_file: + with open(figure_file_name, 'wb') as figure_file: - pyplot.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') + pyplot.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') - # Also generating 3d embedding figure. + # Also generating 3d embedding figure. - figure_3d = pyplot.figure() - figure_3d.set_size_inches(16, 10) + figure_3d = pyplot.figure() + figure_3d.set_size_inches(16, 10) - axes_3d = figure_3d.add_subplot(111, projection = '3d') + axes_3d = figure_3d.add_subplot(111, projection = '3d') - axes_3d.axis('equal') - axes_3d.view_init(elev = 30, azim = -75) + axes_3d.axis('equal') + axes_3d.view_init(elev = 30, azim = -75) - f(axes_3d, embedding_3d_pca) + f(axes_3d, embedding_3d_pca) - # Setting up legend. + # Setting up legend. - axes_3d.set_xlabel('X') - axes_3d.set_ylabel('Y') - axes_3d.set_zlabel('Z') + axes_3d.set_xlabel('X') + axes_3d.set_ylabel('Y') + axes_3d.set_zlabel('Z') - legend_3d = axes_3d.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + legend_3d = axes_3d.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - pyplot.setp(legend_3d.texts, family = 'Gentium') + pyplot.setp(legend_3d.texts, family = 'Gentium') - # Fake cubic bounding box to force axis aspect ratios, see - # https://stackoverflow.com/a/13701747/2016856. + # Fake cubic bounding box to force axis aspect ratios, see + # https://stackoverflow.com/a/13701747/2016856. - X = embedding_3d_pca[:,0] - Y = embedding_3d_pca[:,1] - Z = embedding_3d_pca[:,2] + X = embedding_3d_pca[:,0] + Y = embedding_3d_pca[:,1] + Z = embedding_3d_pca[:,2] - max_range = numpy.array([ - X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() + max_range = numpy.array([ + X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() - Xb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + - 0.5 * (X.max() + X.min())) + Xb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + + 0.5 * (X.max() + X.min())) - Yb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + - 0.5 * (Y.max() + Y.min())) + Yb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + + 0.5 * (Y.max() + Y.min())) - Zb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + - 0.5 * (Z.max() + Z.min())) + Zb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + + 0.5 * (Z.max() + Z.min())) - for xb, yb, zb in zip(Xb, Yb, Zb): - axes_3d.plot([xb], [yb], [zb], 'w') + for xb, yb, zb in zip(Xb, Yb, Zb): + axes_3d.plot([xb], [yb], [zb], 'w') - axes_3d.autoscale_view() + axes_3d.autoscale_view() - # And saving it. + # And saving it. 
- figure_3d_file_name = ( - 'figure 3d cognate distance{0}.png'.format( - mode_name_str)) + figure_3d_file_name = ( + 'figure 3d cognate distance{0}.png'.format( + mode_name_str)) - with open(figure_3d_file_name, 'wb') as figure_3d_file: + with open(figure_3d_file_name, 'wb') as figure_3d_file: - figure_3d.savefig( - figure_3d_file, - bbox_extra_artists = (legend_3d,), + figure_3d.savefig( + figure_3d_file, + bbox_extra_artists = (legend_3d,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') + + # Storing generated figure as a PNG image. + current_datetime = datetime.datetime.now(datetime.timezone.utc) + figure_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + figure_path = os.path.join(storage_dir, figure_filename) + os.makedirs(os.path.dirname(figure_path), exist_ok = True) + + with open(figure_path, 'wb') as figure_file: + + figure.savefig( + figure_file, + bbox_extra_artists = (legend,), bbox_inches = 'tight', pad_inches = 0.25, format = 'png') - # Storing generated figure as a PNG image. - current_datetime = datetime.datetime.now(datetime.timezone.utc) - figure_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) - - figure_path = os.path.join(storage_dir, figure_filename) - os.makedirs(os.path.dirname(figure_path), exist_ok = True) - - with open(figure_path, 'wb') as figure_file: - - figure.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') - - cur_time = time.time() - figure_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', figure_filename]) + cur_time = time.time() + figure_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', figure_filename]) + ### Plotting with matplotlib ends return ( figure_url, @@ -13158,7 +13162,8 @@ def split_lex(lex): distance_header_array, "swadesh", storage, - storage_dir + storage_dir, + __plot_flag__ = False ) result_dict = ( @@ -13191,7 +13196,7 @@ def mutate(self, info, **args): # Administrator / perspective author / editing permission check. 
error_str = ( 'Only administrator, perspective author and users with perspective editing permissions ' - 'can perform swadesh analysis.') + 'can perform Swadesh analysis.') client_id = info.context.request.authenticated_userid From f9633c06c18c51405b1720dbd2265ac376378e29 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 23:30:56 +0300 Subject: [PATCH 34/69] Gathered result_pool, fixed a vulnerability --- lingvodoc/schema/query.py | 73 +++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 51f141eb..c47a4e06 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13061,10 +13061,8 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - dictionary_count = len(perspective_info_list) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count, dtype='object') - for index, (perspective_id, _, translation_field_id) in \ + result_pool = {} + for index, (perspective_id, word_field_id, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -13074,11 +13072,32 @@ def split_lex(lex): .filter_by(client_id=perspective_id[0], object_id=perspective_id[1]) .first() ) - dictionary_name = perspective.parent.get_translation(locale_id) - distance_header_array[index] = dictionary_name # Getting text data. + word_query = ( + DBSession + .query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id) + .filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == word_field_id[0], + dbEntity.field_object_id == word_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) + .add_columns( + func.array_agg(dbEntity.content).label('word')) + .group_by(dbLexicalEntry) + .subquery()) + translation_query = ( DBSession .query( @@ -13100,14 +13119,30 @@ def split_lex(lex): .add_columns( func.array_agg(dbEntity.content).label('translation')) .group_by(dbLexicalEntry) + .subquery()) + + # Main query for word/translation data. + data_query = ( + DBSession + .query(word_query) + .outerjoin(translation_query, and_( + word_query.c.client_id == translation_query.c.client_id, + word_query.c.object_id == translation_query.c.object_id)) + .add_columns( + translation_query.c.translation) .all()) # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() - for row_index, row in enumerate(translation_query): + result_pool[perspective_id] = {'name': dictionary_name} + for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) - translation_list = row[2] + word_list, translation_list = row[2:4] + + # If we have no words for this lexical entry, we skip it altogether. 
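
# For illustration (all ids and words are hypothetical), the shape this loop
# gives result_pool and that the later table-building code consumes: each
# perspective id maps to a 'name' entry plus one human-readable record per
# matched lexical entry; 'group' stays None until the etymological groups are
# applied below.

result_pool_example = {
    (201, 7): {
        'name': 'Hypothetical dictionary',
        (1234, 5678): {
            'group': None,
            'swadesh': 'рыба',
            'word': 'kala',
            'translation': 'рыба',
        },
    },
}
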
+ if not word_list: + continue translation_list = ( [] if not translation_list else [ @@ -13121,24 +13156,35 @@ def split_lex(lex): # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) swadesh_set[perspective_id].add(swadesh_num) - #print(entry_id, swadesh_num, translation_lex) + # Store the entry content in human readable format + result_pool[perspective_id][entry_id] = { + 'group': None, + 'swadesh': swadesh_lex, + 'word': word_list[0], + 'translation': translation_lex + } # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met - links = {} + links = collections.OrderedDict() for perspective, entries in entries_set.items(): links[perspective] = set() for group_index, group in enumerate(group_list): - if (entries & group): + linked = entries & group + if linked: links[perspective].add(group_index) + result_pool[perspective][linked.pop()]['group'] = group_index + + dictionary_count = len(links) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_header_array = numpy.empty(dictionary_count, dtype='object') # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives # commons_total means amount of Swadesh's lexems met in the both perspectives - similarity = {} for n1, (perspective1, groups1) in enumerate(links.items()): - similarity[perspective1] = {} + distance_header_array[n1] = result_pool[perspective1]['name'] print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): #if n2 <= n1: continue #exclude duplicates and self-to-self @@ -13147,7 +13193,6 @@ def split_lex(lex): # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - #similarity[perspective1][perspective2] = commons_linked, commons_total print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() From ee005c9c82097a6fb899ccb40569e83c56a1cdfb Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 25 May 2023 23:59:54 +0300 Subject: [PATCH 35/69] Cleanup --- lingvodoc/schema/query.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c47a4e06..4538a5e9 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,6 +13041,7 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): + #TODO: move this condition if ' заим.' 
in lex: return set() # Split by commas and open brackets to separate @@ -13156,7 +13157,7 @@ def split_lex(lex): # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) swadesh_set[perspective_id].add(swadesh_num) - # Store the entry content in human readable format + # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, 'swadesh': swadesh_lex, @@ -13178,7 +13179,7 @@ def split_lex(lex): dictionary_count = len(links) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) - distance_header_array = numpy.empty(dictionary_count, dtype='object') + distance_header_array = numpy.full(dictionary_count, "", dtype='object') # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives @@ -13196,18 +13197,15 @@ def split_lex(lex): print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') print() - cur_time = time.time() - storage_dir = os.path.join(storage['path'], 'swadesh', str(cur_time)) - _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, base_language_name, distance_data_array, distance_header_array, - "swadesh", - storage, - storage_dir, + None, + None, + None, __plot_flag__ = False ) result_dict = ( From 568eb3e280fab0c2ce52156d4e495f2612afc097 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 11:01:28 +0300 Subject: [PATCH 36/69] Args right order --- lingvodoc/schema/query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4538a5e9..855ea0de 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -10896,8 +10896,8 @@ def distance_graph( mode, storage, storage_dir, - __plot_flag__ = True, - __debug_flag__ = False): + __debug_flag__ = False, + __plot_flag__ = True): d_ij = (distance_data_array + distance_data_array.T) / 2 From 90324a1e4c976621679dbadd25d9c26e26afb7df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 22:43:03 +0300 Subject: [PATCH 37/69] create_table --- lingvodoc/schema/query.py | 63 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 855ea0de..c2758fc3 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13019,6 +13019,64 @@ class Arguments: embedding_3d = graphene.List(graphene.List(graphene.Float)) perspective_name_list = graphene.List(graphene.String) + @staticmethod + def create_table(result_pool, group_count): + ''' + Keys: + result_pool[perspective_id][entry_id] + Fields: + 'group': group_index, + 'swadesh': swadesh_lex, + 'word': word_list[0], + 'translation': translation_lex + ''' + + space = ' ' + col_len = 62 + def combine(*args): + result = space * 2 + fld_len = ((col_len - 2) // len(args)) - 2 + + for s in args: + result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" + return result + + dict_count = len(result_pool) + #print(f"{dict_count}:{result_pool}) + + # 'groups' is horizontals in table before 'single' + groups = [[None] * dict_count] * group_count + + # 'single' is verticals in table after 'groups' + # first element in every vertical is the dictionary name + single = [[]] * dict_count + + # re-group by group number and add joined values + for dict_index, perspective in enumerate(result_pool.values()): + dict_name = 
combine(f"{dict_index + 1}. {perspective['name']}") + single[dict_index].append(dict_name) + + for entry in perspective.values(): + print(entry) + group_num = entry['group'] + entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) + if group_num: + groups[group_num][dict_index] = entry_text + else: + single[dict_index].append(entry_text) + + # iterate through 'groups' and 'single' and concatenate result + result = "" + # headers + result += ''.join(single[:][0]) + '\n\n' + # groups by lines + result += '\n'.join(''.join(line) for line in groups) + # not-cognates by columns + for indent, entries in enumerate(single): + result += '\n'.join(space * col_len * indent + entry for entry in entries) + + return result + @staticmethod def swadesh_statistics( language_str, @@ -13208,11 +13266,16 @@ def split_lex(lex): None, __plot_flag__ = False ) + + result = SwadeshAnalysis.create_table(result_pool, len(group_list)) + print(result) + result_dict = ( dict( triumph = True, + #result = result, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From cd8f5220299e581f9558df8c9530390c49d7a686 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 22:54:11 +0300 Subject: [PATCH 38/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c2758fc3..a58c1a8c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13057,7 +13057,7 @@ def combine(*args): single[dict_index].append(dict_name) for entry in perspective.values(): - print(entry) + if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) if group_num: From 08e5bd8b6086779b22dcf85aaac9f94369b674df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 26 May 2023 23:07:54 +0300 Subject: [PATCH 39/69] Fix --- lingvodoc/schema/query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a58c1a8c..3dfc7ba3 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13042,10 +13042,9 @@ def combine(*args): return result dict_count = len(result_pool) - #print(f"{dict_count}:{result_pool}) # 'groups' is horizontals in table before 'single' - groups = [[None] * dict_count] * group_count + groups = [[""] * dict_count] * group_count # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name @@ -13059,7 +13058,8 @@ def combine(*args): for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = combine(entry['swadesh'], entry['word'], entry['translate']) + entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) + print(entry_text) if group_num: groups[group_num][dict_index] = entry_text else: From 74e40a4753ce696ba16b2d100b5a073e722dea56 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 00:03:10 +0300 Subject: [PATCH 40/69] Some fixes --- lingvodoc/schema/query.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3dfc7ba3..d2dc72a7 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13014,6 +13014,7 @@ class Arguments: triumph = graphene.Boolean() + result = graphene.String() minimum_spanning_tree = 
graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) @@ -13033,6 +13034,7 @@ def create_table(result_pool, group_count): space = ' ' col_len = 62 + # get length-fixed lines def combine(*args): result = space * 2 fld_len = ((col_len - 2) // len(args)) - 2 @@ -13044,36 +13046,33 @@ def combine(*args): dict_count = len(result_pool) # 'groups' is horizontals in table before 'single' - groups = [[""] * dict_count] * group_count + groups = numpy.full((group_count, dict_count), space*col_len, dtype='object') # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name - single = [[]] * dict_count + single = [None] * dict_count # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = combine(f"{dict_index + 1}. {perspective['name']}") - single[dict_index].append(dict_name) - + single[dict_index] = [dict_name] for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) - print(entry_text) if group_num: groups[group_num][dict_index] = entry_text else: single[dict_index].append(entry_text) - # iterate through 'groups' and 'single' and concatenate result result = "" # headers - result += ''.join(single[:][0]) + '\n\n' + result += ''.join(single[n][0] for n in range(dict_count)) + '\n\n' # groups by lines - result += '\n'.join(''.join(line) for line in groups) + result += '\n'.join(''.join(line) for line in groups) + '\n' # not-cognates by columns for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry for entry in entries) + result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) return result @@ -13244,7 +13243,7 @@ def split_lex(lex): # commons_total means amount of Swadesh's lexems met in the both perspectives for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - print(perspective1, end=' :: ') + #print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): #if n2 <= n1: continue #exclude duplicates and self-to-self commons_linked = len(groups1 & groups2) @@ -13252,8 +13251,8 @@ def split_lex(lex): # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') - print() + #print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') + #print() _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13268,14 +13267,13 @@ def split_lex(lex): ) result = SwadeshAnalysis.create_table(result_pool, len(group_list)) - print(result) result_dict = ( dict( triumph = True, - #result = result, + result = result, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From 4b915daa9ac8121cfc2f39bdd0c892adcffa7592 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 10:40:39 +0300 Subject: [PATCH 41/69] Some fixes --- lingvodoc/schema/query.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 
d2dc72a7..2b050353 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,7 +13033,7 @@ def create_table(result_pool, group_count): ''' space = ' ' - col_len = 62 + col_len = 50 # get length-fixed lines def combine(*args): result = space * 2 @@ -13046,7 +13046,7 @@ def combine(*args): dict_count = len(result_pool) # 'groups' is horizontals in table before 'single' - groups = numpy.full((group_count, dict_count), space*col_len, dtype='object') + groups = numpy.full((group_count, dict_count), '.'*col_len, dtype='object') # 'single' is verticals in table after 'groups' # first element in every vertical is the dictionary name @@ -13072,7 +13072,8 @@ def combine(*args): result += '\n'.join(''.join(line) for line in groups) + '\n' # not-cognates by columns for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) + result += '\n'.join(space * col_len * indent + entry + for entry in entries[1:]) + '\n' return result From 9a5f04c49d345abefddb1ce24c3171d73fea465e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Sun, 28 May 2023 16:00:26 +0300 Subject: [PATCH 42/69] Used pandas dataframe --- lingvodoc/schema/query.py | 26 ++++++++++++-------------- server-requirements-final.txt | 1 + 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 2b050353..187c7008 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13032,8 +13032,11 @@ def create_table(result_pool, group_count): 'translation': translation_lex ''' + import pandas as pd + space = ' ' col_len = 50 + # get length-fixed lines def combine(*args): result = space * 2 @@ -13041,29 +13044,23 @@ def combine(*args): for s in args: result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" - return result - - dict_count = len(result_pool) - - # 'groups' is horizontals in table before 'single' - groups = numpy.full((group_count, dict_count), '.'*col_len, dtype='object') - # 'single' is verticals in table after 'groups' - # first element in every vertical is the dictionary name - single = [None] * dict_count + return result + groups = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = combine(f"{dict_index + 1}. 
{perspective['name']}") - single[dict_index] = [dict_name] for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) if group_num: - groups[group_num][dict_index] = entry_text + groups.loc[group_num, dict_name] = entry_text else: - single[dict_index].append(entry_text) + groups.loc[group_count, dict_name] = entry_text + group_count += 1 + ''' # iterate through 'groups' and 'single' and concatenate result result = "" # headers @@ -13074,8 +13071,9 @@ def combine(*args): for indent, entries in enumerate(single): result += '\n'.join(space * col_len * indent + entry for entry in entries[1:]) + '\n' + ''' - return result + return groups.to_html(index=False) @staticmethod def swadesh_statistics( @@ -13120,7 +13118,7 @@ def split_lex(lex): # swadesh_set gathers numbers of words within Swadesh' list entries_set = {} swadesh_set = {} - result_pool = {} + result_pool = collections.OrderedDict() for index, (perspective_id, word_field_id, translation_field_id) in \ enumerate(perspective_info_list): diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 460af8a7..557b0d02 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1 +1,2 @@ matplotlib==1.5.3 +pandas==2.0.1 From 142900251f03f5c09f17d809235b85bf840cd33a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 00:46:37 +0300 Subject: [PATCH 43/69] Pretty table --- lingvodoc/schema/query.py | 9 ++++++--- server-requirements-final.txt | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 187c7008..d12351a9 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13033,7 +13033,9 @@ def create_table(result_pool, group_count): ''' import pandas as pd + from pretty_html_table import build_table + ''' space = ' ' col_len = 50 @@ -13046,15 +13048,16 @@ def combine(*args): result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" return result + ''' groups = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): - dict_name = combine(f"{dict_index + 1}. {perspective['name']}") + dict_name = f"{dict_index + 1}. 
{perspective['name']}" for entry in perspective.values(): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = combine(entry['swadesh'], entry['word'], entry['translation']) + entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" if group_num: groups.loc[group_num, dict_name] = entry_text else: @@ -13073,7 +13076,7 @@ def combine(*args): for entry in entries[1:]) + '\n' ''' - return groups.to_html(index=False) + return build_table(groups, 'blue_light', width="300px") @staticmethod def swadesh_statistics( diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 557b0d02..5497c99d 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,2 +1,3 @@ matplotlib==1.5.3 pandas==2.0.1 +pretty_html_table From 778118ba3878f1eed98f24ccda2950ac538f5f44 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 12:14:31 +0300 Subject: [PATCH 44/69] Show borrowed words --- lingvodoc/schema/query.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index d12351a9..e5da7c78 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13055,7 +13055,8 @@ def combine(*args): for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): - if not isinstance(entry, dict): continue + if not isinstance(entry, dict): + continue group_num = entry['group'] entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" if group_num: @@ -13076,7 +13077,7 @@ def combine(*args): for entry in entries[1:]) + '\n' ''' - return build_table(groups, 'blue_light', width="300px") + return build_table(groups, 'blue_light', width="300px") @staticmethod def swadesh_statistics( @@ -13100,12 +13101,9 @@ def swadesh_statistics( def compare_translations(swadesh_lex, dictionary_lex): def split_lex(lex): - #TODO: move this condition - if ' заим.' in lex: - return set() # Split by commas and open brackets to separate # various forms of lexeme and extra note if is - return set(form.strip().lower() + return set(f" {form}".lower().replace(" заим.", "").strip() for form in lex.replace('(', ',').split(',') if form.strip() and ')' not in form) # exclude notes @@ -13213,28 +13211,32 @@ def split_lex(lex): for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): - # Store entry_id and number of the lex within Swadesh' list - entries_set[perspective_id].add(entry_id) - swadesh_set[perspective_id].add(swadesh_num) # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, + 'borrowed': (" заим." 
in f" {word_list[0]} {translation_lex}"), 'swadesh': swadesh_lex, 'word': word_list[0], 'translation': translation_lex } + # Store entry_id and number of the lex within Swadesh' list + entries_set[perspective_id].add(entry_id) + if not result_pool[perspective_id][entry_id]['borrowed']: + swadesh_set[perspective_id].add(swadesh_num) # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met links = collections.OrderedDict() - for perspective, entries in entries_set.items(): - links[perspective] = set() + for perspective_id, entries in entries_set.items(): + links[perspective_id] = set() for group_index, group in enumerate(group_list): linked = entries & group if linked: - links[perspective].add(group_index) - result_pool[perspective][linked.pop()]['group'] = group_index + entry_id = linked.pop() + result_pool[perspective_id][entry_id]['group'] = group_index + if not result_pool[perspective_id][entry_id]['borrowed']: + links[perspective_id].add(group_index) dictionary_count = len(links) distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) From e6058bba0e86931950ce70c67a78dbfb21f64570 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 14:46:16 +0300 Subject: [PATCH 45/69] pre-export to xlsx --- lingvodoc/schema/query.py | 77 ++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index e5da7c78..aba56f8c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13015,40 +13015,26 @@ class Arguments: triumph = graphene.Boolean() result = graphene.String() + xlsx_url = graphene.String() minimum_spanning_tree = graphene.List(graphene.List(graphene.Int)) embedding_2d = graphene.List(graphene.List(graphene.Float)) embedding_3d = graphene.List(graphene.List(graphene.Float)) perspective_name_list = graphene.List(graphene.String) @staticmethod - def create_table(result_pool, group_count): + def export_dataframe(result_pool, group_count): ''' Keys: result_pool[perspective_id][entry_id] Fields: 'group': group_index, + 'borrowed': bool, 'swadesh': swadesh_lex, 'word': word_list[0], 'translation': translation_lex ''' import pandas as pd - from pretty_html_table import build_table - - ''' - space = ' ' - col_len = 50 - - # get length-fixed lines - def combine(*args): - result = space * 2 - fld_len = ((col_len - 2) // len(args)) - 2 - - for s in args: - result += f"{str(s).ljust(fld_len)[:fld_len]}{space * 2}" - - return result - ''' groups = pd.DataFrame() # re-group by group number and add joined values @@ -13064,20 +13050,46 @@ def combine(*args): else: groups.loc[group_count, dict_name] = entry_text group_count += 1 + + return groups + + @staticmethod + def export_xlsx( + result_dataframe, + base_language_name, + storage + ): + # Exporting analysis results as an Excel file. + + current_datetime = datetime.datetime.now(datetime.timezone.utc) + xlsx_filename = pathvalidate.sanitize_filename( + '{0} {1} {2:04d}.{3:02d}.{4:02d}.xlsx'.format( + base_language_name[:64], + 'glottochronology', + current_datetime.year, + current_datetime.month, + current_datetime.day)) + + cur_time = time.time() + storage_dir = os.path.join(storage['path'], 'glottochronology', str(cur_time)) + + # Storing Excel file with the results. 
+ + xlsx_path = os.path.join(storage_dir, xlsx_filename) + os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) + ''' - # iterate through 'groups' and 'single' and concatenate result - result = "" - # headers - result += ''.join(single[n][0] for n in range(dict_count)) + '\n\n' - # groups by lines - result += '\n'.join(''.join(line) for line in groups) + '\n' - # not-cognates by columns - for indent, entries in enumerate(single): - result += '\n'.join(space * col_len * indent + entry - for entry in entries[1:]) + '\n' + workbook_stream.seek(0) + + with open(xlsx_path, 'wb') as xlsx_file: + shutil.copyfileobj(workbook_stream, xlsx_file) ''' - return build_table(groups, 'blue_light', width="300px") + xlsx_url = ''.join([ + storage['prefix'], storage['static_route'], + 'glottochronology', '/', str(cur_time), '/', xlsx_filename]) + + return xlsx_url @staticmethod def swadesh_statistics( @@ -13088,6 +13100,8 @@ def swadesh_statistics( locale_id, storage): + from pretty_html_table import build_table + swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', @@ -13270,14 +13284,17 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.create_table(result_pool, len(group_list)) + result_dataframe = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframe, base_language_name, storage) + result_table = build_table(result_dataframe, 'blue_light', width="300px") result_dict = ( dict( triumph = True, - result = result, + result = result_table, + xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, embedding_3d = embedding_3d_pca, From 93d72c7ef8bf90db8528720f2f9b968235c7603a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Mon, 29 May 2023 15:54:27 +0300 Subject: [PATCH 46/69] Export to xlsx --- lingvodoc/schema/query.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index aba56f8c..ea52613c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13078,12 +13078,7 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - ''' - workbook_stream.seek(0) - - with open(xlsx_path, 'wb') as xlsx_file: - shutil.copyfileobj(workbook_stream, xlsx_file) - ''' + result_dataframe.to_excel(xlsx_path, index=False, sheet_name='Glottochronology') xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 7ab3b3a5de4239aa7cee5730df68205bb775e649 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 12:42:18 +0300 Subject: [PATCH 47/69] Using phonological transcription --- lingvodoc/schema/query.py | 32 ++++++++++++++++---------------- server-requirements-1.txt | 2 +- server-requirements-final.txt | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index ea52613c..3b8e649c 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13030,7 +13030,7 @@ def export_dataframe(result_pool, group_count): 'group': group_index, 'borrowed': bool, 'swadesh': swadesh_lex, - 'word': word_list[0], + 'transcription': transcription_list[0], 'translation': translation_lex ''' @@ -13044,7 +13044,7 @@ 
def export_dataframe(result_pool, group_count): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = f"{entry['swadesh']} | {entry['word']} | {entry['translation']}" + entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" if group_num: groups.loc[group_num, dict_name] = entry_text else: @@ -13129,7 +13129,7 @@ def split_lex(lex): entries_set = {} swadesh_set = {} result_pool = collections.OrderedDict() - for index, (perspective_id, word_field_id, translation_field_id) in \ + for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): # Getting and saving perspective info. @@ -13142,7 +13142,7 @@ def split_lex(lex): dictionary_name = perspective.parent.get_translation(locale_id) # Getting text data. - word_query = ( + transcription_query = ( DBSession .query( dbLexicalEntry.client_id, @@ -13153,15 +13153,15 @@ def split_lex(lex): dbLexicalEntry.marked_for_deletion == False, dbEntity.parent_client_id == dbLexicalEntry.client_id, dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == word_field_id[0], - dbEntity.field_object_id == word_field_id[1], + dbEntity.field_client_id == transcription_field_id[0], + dbEntity.field_object_id == transcription_field_id[1], dbEntity.marked_for_deletion == False, dbPublishingEntity.client_id == dbEntity.client_id, dbPublishingEntity.object_id == dbEntity.object_id, dbPublishingEntity.published == True, dbPublishingEntity.accepted == True) .add_columns( - func.array_agg(dbEntity.content).label('word')) + func.array_agg(dbEntity.content).label('transcription')) .group_by(dbLexicalEntry) .subquery()) @@ -13188,13 +13188,13 @@ def split_lex(lex): .group_by(dbLexicalEntry) .subquery()) - # Main query for word/translation data. + # Main query for transcription/translation data. data_query = ( DBSession - .query(word_query) + .query(transcription_query) .outerjoin(translation_query, and_( - word_query.c.client_id == translation_query.c.client_id, - word_query.c.object_id == translation_query.c.object_id)) + transcription_query.c.client_id == translation_query.c.client_id, + transcription_query.c.object_id == translation_query.c.object_id)) .add_columns( translation_query.c.translation) .all()) @@ -13205,10 +13205,10 @@ def split_lex(lex): result_pool[perspective_id] = {'name': dictionary_name} for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) - word_list, translation_list = row[2:4] + transcription_list, translation_list = row[2:4] - # If we have no words for this lexical entry, we skip it altogether. - if not word_list: + # If we have no transcriptions for this lexical entry, we skip it altogether. + if not transcription_list: continue translation_list = ( @@ -13223,9 +13223,9 @@ def split_lex(lex): # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, - 'borrowed': (" заим." in f" {word_list[0]} {translation_lex}"), + 'borrowed': (" заим." 
in f" {transcription_list[0]} {translation_lex}"), 'swadesh': swadesh_lex, - 'word': word_list[0], + 'transcription': transcription_list[0], 'translation': translation_lex } # Store entry_id and number of the lex within Swadesh' list diff --git a/server-requirements-1.txt b/server-requirements-1.txt index 50c1d5b0..1fa56ad7 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -65,7 +65,7 @@ pyramid-debugtoolbar==3.0.4 pyramid-mailer==0.15.1 pyramid-mako==1.0.2 pyramid-tm==1.0.1 -python-dateutil==2.8.0 +python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.3 pytz==2018.5 diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 5497c99d..a90fd4b7 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,3 +1,3 @@ matplotlib==1.5.3 -pandas==2.0.1 +pandas==1.4.3 pretty_html_table From fa350bfe905b4a771d186b19b284c1f857510417 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 14:05:05 +0300 Subject: [PATCH 48/69] Exclude tiny (<50 words) dictionaries --- lingvodoc/schema/query.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3b8e649c..44688cc6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,6 +13041,8 @@ def export_dataframe(result_pool, group_count): for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): + # 'entry' iterator may present 'name' or 'suite' field + # but not an inner dictionary for entry if not isinstance(entry, dict): continue group_num = entry['group'] @@ -13202,7 +13204,10 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() - result_pool[perspective_id] = {'name': dictionary_name} + result_pool[perspective_id] = { + 'name': dictionary_name, + 'suit': (len(data_query) > 50) + } for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) transcription_list, translation_list = row[2:4] @@ -13230,7 +13235,8 @@ def split_lex(lex): } # Store entry_id and number of the lex within Swadesh' list entries_set[perspective_id].add(entry_id) - if not result_pool[perspective_id][entry_id]['borrowed']: + if (result_pool[perspective_id]['suit'] and + not result_pool[perspective_id][entry_id]['borrowed']): swadesh_set[perspective_id].add(swadesh_num) # Create dictionary of sets: @@ -13244,7 +13250,8 @@ def split_lex(lex): if linked: entry_id = linked.pop() result_pool[perspective_id][entry_id]['group'] = group_index - if not result_pool[perspective_id][entry_id]['borrowed']: + if (result_pool[perspective_id]['suit'] and + not result_pool[perspective_id][entry_id]['borrowed']): links[perspective_id].add(group_index) dictionary_count = len(links) From f5b6e6042c5555f59b81715cd4b93591c5466a2a Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 16:55:16 +0300 Subject: [PATCH 49/69] Sorting result table --- lingvodoc/schema/query.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 44688cc6..396b730a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -362,6 +362,9 @@ import lingvodoc.scripts.docx_import as docx_import +import pandas as pd +from pretty_html_table import build_table + # Setting up logging. 
log = logging.getLogger(__name__) logging.disable(level=logging.INFO) @@ -13034,15 +13037,14 @@ def export_dataframe(result_pool, group_count): 'translation': translation_lex ''' - import pandas as pd - groups = pd.DataFrame() + single = pd.DataFrame() # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): dict_name = f"{dict_index + 1}. {perspective['name']}" for entry in perspective.values(): - # 'entry' iterator may present 'name' or 'suite' field - # but not an inner dictionary for entry + # 'entry' iterator may present string value of 'name' or 'suite' field + # but not a dictionary for one of entries. Continue in this case. if not isinstance(entry, dict): continue group_num = entry['group'] @@ -13050,14 +13052,15 @@ def export_dataframe(result_pool, group_count): if group_num: groups.loc[group_num, dict_name] = entry_text else: - groups.loc[group_count, dict_name] = entry_text + single.loc[group_count, dict_name] = entry_text group_count += 1 - return groups + return groups.sort_values(groups.columns[0]), single.sort_index() @staticmethod def export_xlsx( - result_dataframe, + result_dataframes, + sheet_names, base_language_name, storage ): @@ -13080,7 +13083,9 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - result_dataframe.to_excel(xlsx_path, index=False, sheet_name='Glottochronology') + with pd.ExcelWriter(xlsx_path) as writer: + for n, df in enumerate(result_dataframes): + df.to_excel(writer, index=False, sheet_name=sheet_names[n]) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], @@ -13097,8 +13102,6 @@ def swadesh_statistics( locale_id, storage): - from pretty_html_table import build_table - swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой', 'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево', 'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо', @@ -13286,16 +13289,19 @@ def split_lex(lex): __plot_flag__ = False ) - result_dataframe = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) - xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframe, base_language_name, storage) - result_table = build_table(result_dataframe, 'blue_light', width="300px") + result_dataframes = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframes, + ['Cognates', 'Singles'], + base_language_name, + storage) + result_tables = (build_table(result_dataframes[0], 'blue_light', width="300px"), + build_table(result_dataframes[1], 'green_light', width="300px")) result_dict = ( - dict( triumph = True, - result = result_table, + result = f"{result_tables[0]}\n\n{result_tables[1]}", xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From e7b10ec184d557fff52740f047e656cf97ef8b53 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 17:14:35 +0300 Subject: [PATCH 50/69] Refactoring --- lingvodoc/schema/query.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 396b730a..3808a13a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13055,12 +13055,14 @@ def export_dataframe(result_pool, group_count): single.loc[group_count, dict_name] = entry_text group_count += 1 - return 
groups.sort_values(groups.columns[0]), single.sort_index() + return { + 'Cognates': groups.sort_values(groups.columns[0]), + 'Singles': single.sort_index() + } @staticmethod def export_xlsx( - result_dataframes, - sheet_names, + result, base_language_name, storage ): @@ -13084,8 +13086,8 @@ def export_xlsx( os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) with pd.ExcelWriter(xlsx_path) as writer: - for n, df in enumerate(result_dataframes): - df.to_excel(writer, index=False, sheet_name=sheet_names[n]) + for sheet_name, df in result.items(): + df.to_excel(writer, index=False, sheet_name=sheet_name) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], @@ -13289,13 +13291,10 @@ def split_lex(lex): __plot_flag__ = False ) - result_dataframes = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) - xlsx_url = SwadeshAnalysis.export_xlsx(result_dataframes, - ['Cognates', 'Singles'], - base_language_name, - storage) - result_tables = (build_table(result_dataframes[0], 'blue_light', width="300px"), - build_table(result_dataframes[1], 'green_light', width="300px")) + result = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), + build_table(result['Singles'], 'green_light', width="300px")) result_dict = ( dict( From 3b195579c0b89a06ea2a31f70f2e479d13bf7cf1 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 17:44:46 +0300 Subject: [PATCH 51/69] Set columns width --- lingvodoc/schema/query.py | 3 ++- server-requirements-final.txt | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 3808a13a..c47f63fa 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13085,9 +13085,10 @@ def export_xlsx( xlsx_path = os.path.join(storage_dir, xlsx_filename) os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) - with pd.ExcelWriter(xlsx_path) as writer: + with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer: for sheet_name, df in result.items(): df.to_excel(writer, index=False, sheet_name=sheet_name) + writer.sheets[sheet_name].set_column(0, df.shape[1] - 1, 30) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], diff --git a/server-requirements-final.txt b/server-requirements-final.txt index a90fd4b7..1a41656e 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1,3 +1,4 @@ matplotlib==1.5.3 pandas==1.4.3 pretty_html_table +xlsxwriter From a408f5b08cec66d2d2cd1f40d2f7b0664ac67d06 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 18:55:57 +0300 Subject: [PATCH 52/69] Bundles --- lingvodoc/schema/query.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index c47f63fa..f6a87016 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13025,7 +13025,7 @@ class Arguments: perspective_name_list = graphene.List(graphene.String) @staticmethod - def export_dataframe(result_pool, group_count): + def export_dataframe(result_pool, bundles): ''' Keys: result_pool[perspective_id][entry_id] @@ -13039,6 +13039,7 @@ def export_dataframe(result_pool, group_count): groups = pd.DataFrame() single = pd.DataFrame() + row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): 
dict_name = f"{dict_index + 1}. {perspective['name']}" @@ -13049,11 +13050,11 @@ def export_dataframe(result_pool, group_count): continue group_num = entry['group'] entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" - if group_num: + if group_num and group_num in bundles: groups.loc[group_num, dict_name] = entry_text else: - single.loc[group_count, dict_name] = entry_text - group_count += 1 + single.loc[row_index, dict_name] = entry_text + row_index += 1 return { 'Cognates': groups.sort_values(groups.columns[0]), @@ -13267,18 +13268,16 @@ def split_lex(lex): # Calculate intersection between lists of group numbers # So length of this intersection is the similarity of corresponding perspectives # commons_total means amount of Swadesh's lexems met in the both perspectives + bundles = set() for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - #print(perspective1, end=' :: ') for n2, (perspective2, groups2) in enumerate(links.items()): - #if n2 <= n1: continue #exclude duplicates and self-to-self + bundles.update(groups1 & groups2) commons_linked = len(groups1 & groups2) commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) # commons_linked > 0 means that commons_total > 0 even more so distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - #print(f"{perspective2}:{commons_linked}/{commons_total}:{distance:.2f}", end=' | ') - #print() _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13292,7 +13291,7 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.export_dataframe(result_pool, len(group_list)) + result = SwadeshAnalysis.export_dataframe(result_pool, bundles) xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) @@ -13301,7 +13300,7 @@ def split_lex(lex): dict( triumph = True, - result = f"{result_tables[0]}\n\n{result_tables[1]}", + result = f"{result_tables[0]}
\n\n
{result_tables[1]}", xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From ccd9935ec04fe693c7ce404f70cf412aa624f6d2 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 20:00:19 +0300 Subject: [PATCH 53/69] Full transcription --- lingvodoc/schema/query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index f6a87016..ae56775a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13229,15 +13229,16 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) + transcription_lex = ', '.join(transcription_list) for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: if compare_translations(swadesh_lex, translation_lex): # Store the entry's content in human readable format result_pool[perspective_id][entry_id] = { 'group': None, - 'borrowed': (" заим." in f" {transcription_list[0]} {translation_lex}"), + 'borrowed': (" заим." in f" {transcription_lex} {translation_lex}"), 'swadesh': swadesh_lex, - 'transcription': transcription_list[0], + 'transcription': transcription_lex, 'translation': translation_lex } # Store entry_id and number of the lex within Swadesh' list From e50fe2bc38461937fa515b005291263532ebfe6f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 20:43:35 +0300 Subject: [PATCH 54/69] Exclude self-to-self groups --- lingvodoc/schema/query.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index ae56775a..6981a120 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13273,12 +13273,15 @@ def split_lex(lex): for n1, (perspective1, groups1) in enumerate(links.items()): distance_header_array[n1] = result_pool[perspective1]['name'] for n2, (perspective2, groups2) in enumerate(links.items()): - bundles.update(groups1 & groups2) - commons_linked = len(groups1 & groups2) - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) - # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 - distance_data_array[n1][n2] = distance + if n1 == n2: + distance_data_array[n1][n2] = 0 + else: + bundles.update(groups1 & groups2) + commons_linked = len(groups1 & groups2) + commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) + # commons_linked > 0 means that commons_total > 0 even more so + distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + distance_data_array[n1][n2] = distance _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( From 9062dfb3998ff5fa54cbd73302d73ca62d0775aa Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 21:41:00 +0300 Subject: [PATCH 55/69] Fixed dependencies --- server-requirements-1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server-requirements-1.txt b/server-requirements-1.txt index 1fa56ad7..a2d1bb7c 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -68,7 +68,7 @@ pyramid-tm==1.0.1 python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.3 -pytz==2018.5 +pytz==2020.1 PyYAML==5.2 redis==2.10.5 regex==2019.6.8 From f85915ba4b0c81564b5ac60d1f52fe49a3848ce8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 23:22:50 +0300 
Subject: [PATCH 56/69] Text wrap --- lingvodoc/schema/query.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 6981a120..bd26f1bb 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13087,9 +13087,19 @@ def export_xlsx( os.makedirs(os.path.dirname(xlsx_path), exist_ok=True) with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer: + header_format = writer.book.add_format({'bold': True, + 'text_wrap': True, + 'valign': 'top', + 'fg_color': '#D7E4BC', + 'border': 1}) for sheet_name, df in result.items(): - df.to_excel(writer, index=False, sheet_name=sheet_name) - writer.sheets[sheet_name].set_column(0, df.shape[1] - 1, 30) + df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False) + worksheet = writer.sheets[sheet_name] + worksheet.set_column(0, df.shape[1] - 1, 30) + # Write the column headers with the defined format. + for col_num, value in enumerate(df.columns.values): + worksheet.write(0, col_num, value, header_format) + worksheet.set_row(0, 70) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 6cc4a81a96b35d7f1fe60247d483e0215b36032c Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Tue, 30 May 2023 23:23:47 +0300 Subject: [PATCH 57/69] Deps --- server-requirements-1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server-requirements-1.txt b/server-requirements-1.txt index a2d1bb7c..0441a615 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -20,7 +20,7 @@ configparser==4.0.2 cycler==0.10.0 DataProperty==0.42.1 defusedxml==0.6.0 -dill==0.3.5.1 +dill==0.3.6 docutils==0.15.2 dogpile.cache==0.6.8 et-xmlfile==1.0.1 From 0d2cf320413dc63b22a333f32d49584a5b386138 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 16:50:11 +0300 Subject: [PATCH 58/69] Garbage collecting --- lingvodoc/schema/query.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index bd26f1bb..86b5ad9a 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13160,6 +13160,9 @@ def split_lex(lex): ) dictionary_name = perspective.parent.get_translation(locale_id) + # GC + del perspective + # Getting text data. transcription_query = ( DBSession @@ -13218,6 +13221,10 @@ def split_lex(lex): translation_query.c.translation) .all()) + # GC + del transcription_query + del translation_query + # Grouping translations by lexical entries. 
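The matching below relies on the lexeme-splitting rule changed earlier in the "Show borrowed words" patch: bracketed notes are dropped, the " заим." borrowing mark is stripped, and the remaining comma-separated forms are lower-cased. A standalone sketch of that function with invented inputs (the sample strings are illustrative only):

def split_lex(lex):
    # Split by commas and open brackets to separate
    # various forms of the lexeme from extra notes, if any.
    return set(f" {form}".lower().replace(" заим.", "").strip()
               for form in lex.replace('(', ',').split(',')
               if form.strip() and ')' not in form)  # exclude notes

print(sorted(split_lex("Kewe, заим. kiwi (onomat.)")))  # -> ['kewe', 'kiwi']
print(sorted(split_lex("рука (кисть)")))                # -> ['рука']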
entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() @@ -13257,6 +13264,9 @@ def split_lex(lex): not result_pool[perspective_id][entry_id]['borrowed']): swadesh_set[perspective_id].add(swadesh_num) + # GC + del data_query + # Create dictionary of sets: # keys: pepspective_id # values: numbers of etymological groups where an entry from dictionary is met @@ -13293,6 +13303,15 @@ def split_lex(lex): distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance + result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + # GC + del result_pool + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) + result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), + build_table(result['Singles'], 'green_light', width="300px")) + # GC + del result + _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( language_str, @@ -13305,11 +13324,6 @@ def split_lex(lex): __plot_flag__ = False ) - result = SwadeshAnalysis.export_dataframe(result_pool, bundles) - xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) - result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), - build_table(result['Singles'], 'green_light', width="300px")) - result_dict = ( dict( triumph = True, From df09c7079264563a9b06d42c7ed494e368f3823e Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 18:06:29 +0300 Subject: [PATCH 59/69] Control output size --- lingvodoc/schema/query.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 86b5ad9a..db9d857d 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13057,7 +13057,7 @@ def export_dataframe(result_pool, bundles): row_index += 1 return { - 'Cognates': groups.sort_values(groups.columns[0]), + 'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]), 'Singles': single.sort_index() } @@ -13304,13 +13304,24 @@ def split_lex(lex): distance_data_array[n1][n2] = distance result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + # GC del result_pool + xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) result_tables = (build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) + + # Control output size + huge_size = 1048576 + result = f"{result_tables[0]}
\n\n
{result_tables[1]}" + if len(result) > huge_size: + result = f"{result_tables[0]}
\n\nNote: The table with single words is not shown because the total output is too large
" + if len(result) > huge_size: + result = "
\n\nNote: The result tables are not shown because the total output is too large
" + # GC - del result + del result_tables _, mst_list, embedding_2d_pca, embedding_3d_pca = \ CognateAnalysis.distance_graph( @@ -13328,7 +13339,7 @@ def split_lex(lex): dict( triumph = True, - result = f"{result_tables[0]}
\n\n
{result_tables[1]}", + result = result, xlsx_url = xlsx_url, minimum_spanning_tree = mst_list, embedding_2d = embedding_2d_pca, From 35bc7664b6908bde3dad414a4e71a43b038de6ad Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 19:33:14 +0300 Subject: [PATCH 60/69] Distances worksheet --- lingvodoc/schema/query.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index db9d857d..84583050 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13025,7 +13025,7 @@ class Arguments: perspective_name_list = graphene.List(graphene.String) @staticmethod - def export_dataframe(result_pool, bundles): + def export_dataframe(result_pool, distance_data_array, bundles): ''' Keys: result_pool[perspective_id][entry_id] @@ -13038,7 +13038,9 @@ def export_dataframe(result_pool, bundles): ''' groups = pd.DataFrame() - single = pd.DataFrame() + singles = pd.DataFrame() + distances = pd.DataFrame(distance_data_array, + columns=[perspective['name'] for perspective in result_pool.values()]) row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): @@ -13049,16 +13051,17 @@ def export_dataframe(result_pool, bundles): if not isinstance(entry, dict): continue group_num = entry['group'] - entry_text = f"{entry['swadesh']} | {entry['transcription']} | {entry['translation']}" + entry_text = f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}" if group_num and group_num in bundles: groups.loc[group_num, dict_name] = entry_text else: - single.loc[row_index, dict_name] = entry_text + singles.loc[row_index, dict_name] = entry_text row_index += 1 return { 'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]), - 'Singles': single.sort_index() + 'Singles': singles.sort_index(), + 'Distances': distances.sort_index() } @staticmethod @@ -13093,7 +13096,14 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): - df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False) + index = (sheet_name == 'Distances') + startcol = int(index) + df.to_excel(writer, + sheet_name=sheet_name, + index=index, + startrow=1, + startcol=startcol, + header=False) worksheet = writer.sheets[sheet_name] worksheet.set_column(0, df.shape[1] - 1, 30) # Write the column headers with the defined format. 
@@ -13303,7 +13313,7 @@ def split_lex(lex): distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 distance_data_array[n1][n2] = distance - result = SwadeshAnalysis.export_dataframe(result_pool, bundles) + result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) # GC del result_pool From a5faa455b9eef593d5e1e9d67291acc57528ce32 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:18:58 +0300 Subject: [PATCH 61/69] Float distances --- lingvodoc/schema/query.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 84583050..287c89c6 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13041,10 +13041,13 @@ def export_dataframe(result_pool, distance_data_array, bundles): singles = pd.DataFrame() distances = pd.DataFrame(distance_data_array, columns=[perspective['name'] for perspective in result_pool.values()]) + # Start index for distances from 1 to match with dictionaries numbers + distances.index += 1 + row_index = 0 # re-group by group number and add joined values for dict_index, perspective in enumerate(result_pool.values()): - dict_name = f"{dict_index + 1}. {perspective['name']}" + dict_name = perspective['name'] for entry in perspective.values(): # 'entry' iterator may present string value of 'name' or 'suite' field # but not a dictionary for one of entries. Continue in this case. @@ -13102,13 +13105,12 @@ def export_xlsx( sheet_name=sheet_name, index=index, startrow=1, - startcol=startcol, header=False) worksheet = writer.sheets[sheet_name] - worksheet.set_column(0, df.shape[1] - 1, 30) + worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): - worksheet.write(0, col_num, value, header_format) + worksheet.write(0, col_num + startcol, value, header_format) worksheet.set_row(0, 70) xlsx_url = ''.join([ @@ -13239,7 +13241,7 @@ def split_lex(lex): entries_set[perspective_id] = set() swadesh_set[perspective_id] = set() result_pool[perspective_id] = { - 'name': dictionary_name, + 'name': f"{index + 1}. 
{dictionary_name}", 'suit': (len(data_query) > 50) } for row_index, row in enumerate(data_query): @@ -13293,7 +13295,7 @@ def split_lex(lex): links[perspective_id].add(group_index) dictionary_count = len(links) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 100, dtype='float') distance_header_array = numpy.full(dictionary_count, "", dtype='object') # Calculate intersection between lists of group numbers From 42d3ee3d7143952f555073390ff5efd37c20d207 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:26:12 +0300 Subject: [PATCH 62/69] Cleanup --- lingvodoc/schema/query.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 287c89c6..6c227ce0 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13099,19 +13099,21 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): + worksheet = writer.sheets[sheet_name] index = (sheet_name == 'Distances') startcol = int(index) + df.to_excel(writer, sheet_name=sheet_name, index=index, startrow=1, header=False) - worksheet = writer.sheets[sheet_name] + + worksheet.set_row(0, 70) worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. for col_num, value in enumerate(df.columns.values): worksheet.write(0, col_num + startcol, value, header_format) - worksheet.set_row(0, 70) xlsx_url = ''.join([ storage['prefix'], storage['static_route'], From 8cda6f7193ba692cd15e47976a54020153c441df Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:36:00 +0300 Subject: [PATCH 63/69] Fix --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 6c227ce0..88a2a774 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13099,7 +13099,6 @@ def export_xlsx( 'fg_color': '#D7E4BC', 'border': 1}) for sheet_name, df in result.items(): - worksheet = writer.sheets[sheet_name] index = (sheet_name == 'Distances') startcol = int(index) @@ -13109,6 +13108,7 @@ def export_xlsx( startrow=1, header=False) + worksheet = writer.sheets[sheet_name] worksheet.set_row(0, 70) worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. From 652cae7c4de62d5808c64b698992b1f4d379adf8 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Wed, 31 May 2023 20:45:30 +0300 Subject: [PATCH 64/69] Cleanup --- lingvodoc/schema/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 88a2a774..4f9764ca 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13110,7 +13110,7 @@ def export_xlsx( worksheet = writer.sheets[sheet_name] worksheet.set_row(0, 70) - worksheet.set_column(0, df.shape[1] - 1 + startcol, 30) + worksheet.set_column(startcol, df.shape[1] - 1 + startcol, 30) # Write the column headers with the defined format. 
for col_num, value in enumerate(df.columns.values): worksheet.write(0, col_num + startcol, value, header_format) From 221df2eecacd276d84587df53b5cc52f53d7c9d4 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 18:36:27 +0300 Subject: [PATCH 65/69] Fixed "more links than means" --- lingvodoc/schema/query.py | 69 +++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 4f9764ca..b1cd7363 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13046,7 +13046,7 @@ def export_dataframe(result_pool, distance_data_array, bundles): row_index = 0 # re-group by group number and add joined values - for dict_index, perspective in enumerate(result_pool.values()): + for perspective in result_pool.values(): dict_name = perspective['name'] for entry in perspective.values(): # 'entry' iterator may present string value of 'name' or 'suite' field @@ -13158,10 +13158,10 @@ def split_lex(lex): # Getting text data for each perspective. # entries_set gathers entry_id(s) of words met in Swadesh' list - # swadesh_set gathers numbers of words within Swadesh' list + # swadesh_total gathers numbers of words within Swadesh' list entries_set = {} - swadesh_set = {} - result_pool = collections.OrderedDict() + swadesh_total = {} + result_pool = {} for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -13241,7 +13241,7 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() - swadesh_set[perspective_id] = set() + swadesh_total[perspective_id] = set() result_pool[perspective_id] = { 'name': f"{index + 1}. {dictionary_name}", 'suit': (len(data_query) > 50) @@ -13260,6 +13260,7 @@ def split_lex(lex): for translation in translation_list if translation.strip()]) + # Parsing translations and matching with Swadesh's words transcription_lex = ', '.join(transcription_list) for swadesh_num, swadesh_lex in enumerate(swadesh_list): for translation_lex in translation_list: @@ -13272,49 +13273,69 @@ def split_lex(lex): 'transcription': transcription_lex, 'translation': translation_lex } - # Store entry_id and number of the lex within Swadesh' list + # Store entry_id and number of the lex within Swadesh's list entries_set[perspective_id].add(entry_id) if (result_pool[perspective_id]['suit'] and not result_pool[perspective_id][entry_id]['borrowed']): - swadesh_set[perspective_id].add(swadesh_num) + # Total list of Swadesh's words in the perspective, + # they can have no any etimological links + swadesh_total[perspective_id].add(swadesh_num) # GC del data_query - # Create dictionary of sets: - # keys: pepspective_id - # values: numbers of etymological groups where an entry from dictionary is met - links = collections.OrderedDict() + # Checking if found entries have links + means = collections.OrderedDict() for perspective_id, entries in entries_set.items(): - links[perspective_id] = set() + means[perspective_id] = collections.defaultdict(set) for group_index, group in enumerate(group_list): + # Select etimologically linked entries linked = entries & group if linked: entry_id = linked.pop() result_pool[perspective_id][entry_id]['group'] = group_index + swadesh = result_pool[perspective_id][entry_id]['swadesh'] + # Store the correspondence: perspective { means(1/2/3) { etimological_groups(1.1/1.2/2.1/3.1) if (result_pool[perspective_id]['suit'] and not 
result_pool[perspective_id][entry_id]['borrowed']): - links[perspective_id].add(group_index) + means[perspective_id][swadesh].add(group_index) - dictionary_count = len(links) - distance_data_array = numpy.full((dictionary_count, dictionary_count), 100, dtype='float') + dictionary_count = len(means) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') distance_header_array = numpy.full(dictionary_count, "", dtype='object') - # Calculate intersection between lists of group numbers + # Calculate intersection between lists of linked means (Swadesh matching) # So length of this intersection is the similarity of corresponding perspectives - # commons_total means amount of Swadesh's lexems met in the both perspectives + # means_total is amount of Swadesh's lexems met in the both perspectives bundles = set() - for n1, (perspective1, groups1) in enumerate(links.items()): + # Calculate each-to-each distances, exclude self-to-self + for n1, (perspective1, means1) in enumerate(means.items()): distance_header_array[n1] = result_pool[perspective1]['name'] - for n2, (perspective2, groups2) in enumerate(links.items()): + for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: distance_data_array[n1][n2] = 0 else: - bundles.update(groups1 & groups2) - commons_linked = len(groups1 & groups2) - commons_total = len(swadesh_set[perspective1] & swadesh_set[perspective2]) - # commons_linked > 0 means that commons_total > 0 even more so - distance = math.log(commons_linked / commons_total) / -0.14 if commons_linked > 0 else 100 + # Common means of entries which have etimological linkes + # but this linkes may be not mutual + means_common = means1.keys() & means2.keys() + means_linked = 0 + # Checking if the found means have common links + for swadesh in means_common: + links_common = means1[swadesh] & means2[swadesh] + if links_common: + # Bundles are linkes with two or more entries in the result table + bundles.update(links_common) + means_linked += 1 + + means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) + + if n2 > n1 and len(means_common) > means_linked: + log.debug(f"{n1+1},{n2+1} : " + f"{len(means_common)} but {means_linked} of {means_total} : " + f"{', '.join(sorted(means_common))}") + + # means_linked > 0 means that means_total > 0 even more so + distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 distance_data_array[n1][n2] = distance result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) From 67353f170b051cb294d34e6aa423f1752f37b559 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 19:26:05 +0300 Subject: [PATCH 66/69] Distances web table --- lingvodoc/schema/query.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index b1cd7363..87bd76cc 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13336,7 +13336,7 @@ def split_lex(lex): # means_linked > 0 means that means_total > 0 even more so distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50 - distance_data_array[n1][n2] = distance + distance_data_array[n1][n2] = round(distance, 2) result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles) @@ -13344,16 +13344,19 @@ def split_lex(lex): del result_pool xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage) - result_tables = (build_table(result['Cognates'], 
'blue_light', width="300px"), + result_tables = (build_table(result['Distances'], 'orange_light', width="300px", index=True), + build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) # Control output size - huge_size = 1048576 - result = f"{result_tables[0]}
\n\n
{result_tables[1]}" + huge_size = 262144 #1048576 + result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" if len(result) > huge_size: - result = f"{result_tables[0]}
\n\nNote: The table with single words is not shown due to huge summary size
" + result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ + f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: - result = "
\n\nNote: The result tables are not shown due to huge summary size
" + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" # GC del result_tables From 1da39a7569e37d05e7882657125f0308e9e562c6 Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 21:09:26 +0300 Subject: [PATCH 67/69] Disabled word tables --- lingvodoc/schema/query.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 87bd76cc..42e3af37 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13329,7 +13329,7 @@ def split_lex(lex): means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2]) - if n2 > n1 and len(means_common) > means_linked: + if n2 > n1 and len(means_common) > 0: log.debug(f"{n1+1},{n2+1} : " f"{len(means_common)} but {means_linked} of {means_total} : " f"{', '.join(sorted(means_common))}") @@ -13348,6 +13348,7 @@ def split_lex(lex): build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) + ''' # Control output size huge_size = 262144 #1048576 result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" @@ -13355,8 +13356,10 @@ def split_lex(lex): result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: - result = f"{result_tables[0]}" \ - f"
\n\nNote: The result tables with words are not shown due to huge summary size
" + ''' + + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" # GC del result_tables From 01bc05966f7aba77364ad2baf67e557208341b6f Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Thu, 1 Jun 2023 23:05:03 +0300 Subject: [PATCH 68/69] The result tables are hidden --- lingvodoc/schema/query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 42e3af37..a41cfbf4 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13356,10 +13356,11 @@ def split_lex(lex): result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ f"
\n\nNote: The table with single words is not shown due to huge summary size
" if len(result) > huge_size: + result = f"{result_tables[0]}" \ + f"
\n\nNote: The result tables with words are not shown due to huge summary size
" ''' - result = f"{result_tables[0]}" \ - f"
\n\nNote: The result tables with words are not shown due to huge summary size
" + result = "Note: The result tables are hidden" # GC del result_tables From 6d3f94668e6de579f407ede1b9f6310a978e75cf Mon Sep 17 00:00:00 2001 From: Vladimir Monakhov Date: Fri, 2 Jun 2023 16:10:30 +0300 Subject: [PATCH 69/69] Forget tiny dicts --- lingvodoc/schema/query.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index a41cfbf4..ec6a912b 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -13162,6 +13162,7 @@ def split_lex(lex): entries_set = {} swadesh_total = {} result_pool = {} + tiny_dicts = set() for index, (perspective_id, transcription_field_id, translation_field_id) in \ enumerate(perspective_info_list): @@ -13242,10 +13243,7 @@ def split_lex(lex): # Grouping translations by lexical entries. entries_set[perspective_id] = set() swadesh_total[perspective_id] = set() - result_pool[perspective_id] = { - 'name': f"{index + 1}. {dictionary_name}", - 'suit': (len(data_query) > 50) - } + result_pool[perspective_id] = {'name': dictionary_name} for row_index, row in enumerate(data_query): entry_id = tuple(row[:2]) transcription_list, translation_list = row[2:4] @@ -13275,12 +13273,18 @@ def split_lex(lex): } # Store entry_id and number of the lex within Swadesh's list entries_set[perspective_id].add(entry_id) - if (result_pool[perspective_id]['suit'] and - not result_pool[perspective_id][entry_id]['borrowed']): + if not result_pool[perspective_id][entry_id]['borrowed']: # Total list of Swadesh's words in the perspective, # they can have no any etimological links swadesh_total[perspective_id].add(swadesh_num) + # Forget the dictionary if it contains less than 50 Swadesh words + if len(swadesh_total[perspective_id]) < 50: + del entries_set[perspective_id] + del swadesh_total[perspective_id] + del result_pool[perspective_id] + tiny_dicts.add(dictionary_name) + # GC del data_query @@ -13296,8 +13300,7 @@ def split_lex(lex): result_pool[perspective_id][entry_id]['group'] = group_index swadesh = result_pool[perspective_id][entry_id]['swadesh'] # Store the correspondence: perspective { means(1/2/3) { etimological_groups(1.1/1.2/2.1/3.1) - if (result_pool[perspective_id]['suit'] and - not result_pool[perspective_id][entry_id]['borrowed']): + if not result_pool[perspective_id][entry_id]['borrowed']: means[perspective_id][swadesh].add(group_index) dictionary_count = len(means) @@ -13310,6 +13313,8 @@ def split_lex(lex): bundles = set() # Calculate each-to-each distances, exclude self-to-self for n1, (perspective1, means1) in enumerate(means.items()): + # Numerate dictionaries + result_pool[perspective1]['name'] = f"{n1 + 1}. {result_pool[perspective1]['name']}" distance_header_array[n1] = result_pool[perspective1]['name'] for n2, (perspective2, means2) in enumerate(means.items()): if n1 == n2: @@ -13348,9 +13353,8 @@ def split_lex(lex): build_table(result['Cognates'], 'blue_light', width="300px"), build_table(result['Singles'], 'green_light', width="300px")) - ''' # Control output size - huge_size = 262144 #1048576 + huge_size = 1048576 result = f"{result_tables[0]}
\n\n
{result_tables[1]}
\n\n
{result_tables[2]}" if len(result) > huge_size: result = f"{result_tables[0]}
\n\n
{result_tables[1]}" \ @@ -13358,9 +13362,8 @@ def split_lex(lex): if len(result) > huge_size: result = f"{result_tables[0]}" \ f"
\n\nNote: The result tables with words are not shown due to huge summary size
" - ''' - - result = "Note: The result tables are hidden" + result += ("
Note: The following dictionaries contain too few words and were not processed: \n\n" +
+                   '\n'.join(tiny_dicts) + "
") if tiny_dicts else "" # GC del result_tables