diff --git a/lingvodoc/schema/query.py b/lingvodoc/schema/query.py index 8c270b5c..ec6a912b 100644 --- a/lingvodoc/schema/query.py +++ b/lingvodoc/schema/query.py @@ -362,8 +362,12 @@ import lingvodoc.scripts.docx_import as docx_import +import pandas as pd +from pretty_html_table import build_table + # Setting up logging. log = logging.getLogger(__name__) +logging.disable(level=logging.INFO) # Trying to set up celery logging. @@ -10887,1705 +10891,1735 @@ def f_callback(xyz): return result_x, f(result.x) @staticmethod - def perform_cognate_analysis( - language_str, - source_perspective_id, - base_language_id, - base_language_name, - group_field_id, - perspective_info_list, - multi_list, - multi_name_list, - mode, - distance_flag, - reference_perspective_id, - figure_flag, - distance_vowel_flag, - distance_consonant_flag, - match_translations_value, - only_orphans_flag, - locale_id, - storage, - task_status = None, - __debug_flag__ = False, - __intermediate_flag__ = False): - """ - Performs cognate analysis in either synchronous or asynchronous mode. - """ + def distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array, + mode, + storage, + storage_dir, + __debug_flag__ = False, + __plot_flag__ = True): - __result_flag__ = False + d_ij = (distance_data_array + distance_data_array.T) / 2 - if task_status is not None: - task_status.set(1, 0, 'Gathering grouping data') + log.debug( + '\ncognate_analysis {0}:' + '\ndistance_header_array:\n{1}' + '\ndistance_data_array:\n{2}' + '\nd_ij:\n{3}'.format( + language_str, + distance_header_array, + distance_data_array, + d_ij)) - # Sometimes in debugging mode we should return already computed results. + # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to + # orient it left-right. - if __debug_flag__: + if len(distance_data_array) > 1: - tag_data_digest = ( + embedding_2d, strain_2d = ( + CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) - hashlib.md5( + embedding_2d_pca = ( + sklearn.decomposition.PCA(n_components = 2) + .fit_transform(embedding_2d)) - repr(list(group_field_id) + - [perspective_info[0] for perspective_info in perspective_info_list]) + distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) - .encode('utf-8')) + else: - .hexdigest()) + embedding_2d = numpy.zeros((1, 2)) + embedding_2d_pca = numpy.zeros((1, 2)) - result_file_name = ( + strain_2d = 0.0 - '__result_{0}_{1}__.gz'.format( + distance_2d = numpy.zeros((1, 1)) - 'multi{0}'.format(len(multi_list)) - if mode == 'multi' else - '{0}_{1}'.format(*base_language_id), + # Showing what we computed. - tag_data_digest)) + log.debug( + '\ncognate_analysis {0}:' + '\nembedding 2d:\n{1}' + '\nembedding 2d (PCA-oriented):\n{2}' + '\nstrain 2d:\n{3}' + '\ndistances 2d:\n{4}'.format( + language_str, + embedding_2d, + embedding_2d_pca, + strain_2d, + distance_2d)) - if __result_flag__ and os.path.exists(result_file_name): + # And now the same with 3d embedding. - with gzip.open( - result_file_name, 'rb') as result_file: + if len(distance_data_array) > 1: - result_dict = pickle.load(result_file) + embedding_3d, strain_3d = ( + CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) - return CognateAnalysis(**result_dict) + # At least three points, standard PCA-based orientation. - # Gathering entry grouping data. 
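# The 2d projection described above ("relative distance strain optimization",
# then PCA to orient the result left-right) can be sketched as follows.  This
# is an illustrative stand-in, not the module's own graph_2d_embedding(); the
# exact strain functional is an assumption, and off-diagonal distances are
# assumed strictly positive.

import numpy
import scipy.optimize
import sklearn.decomposition

def strain_embedding_2d(d_ij, seed = 0):

    n = d_ij.shape[0]
    i, j = numpy.triu_indices(n, k = 1)

    def strain(x_flat):

        x = x_flat.reshape(n, 2)
        delta = numpy.linalg.norm(x[i] - x[j], axis = -1)

        # Relative strain: each pair's error is scaled by its target distance.
        return numpy.sum(((delta - d_ij[i, j]) / d_ij[i, j]) ** 2)

    x0 = numpy.random.default_rng(seed).standard_normal(n * 2)
    embedding = scipy.optimize.minimize(strain, x0).x.reshape(n, 2)

    # PCA, as above, rotates the embedding so its widest spread is left-right.
    return sklearn.decomposition.PCA(n_components = 2).fit_transform(embedding)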
+            if len(distance_data_array) >= 3:
-        perspective_dict = collections.defaultdict(dict)
+                embedding_3d_pca = (
+                    sklearn.decomposition.PCA(n_components = 3)
+                        .fit_transform(embedding_3d))
-        entry_already_set = set()
-        group_list = []
+            # Only two points, so we take 2d embedding and extend it with zeros.
-        tag_dict = collections.defaultdict(set)
+            else:
-        text_dict = {}
-        entry_id_dict = {}
+                embedding_3d_pca = (
-        if not __debug_flag__:
+                    numpy.hstack((
+                        embedding_2d_pca,
+                        numpy.zeros((embedding_2d_pca.shape[0], 1)))))
-            entry_already_set, group_list, group_time = (
+            # Making 3d embedding actually 3d, if required.
-                CognateAnalysis.tag_data_plpgsql(
-                    perspective_info_list, group_field_id))
+            if embedding_3d_pca.shape[1] <= 2:
+
+                embedding_3d_pca = (
+
+                    numpy.hstack((
+                        embedding_3d_pca,
+                        numpy.zeros((embedding_3d_pca.shape[0], 1)))))
+
+            distance_3d = (
+                sklearn.metrics.euclidean_distances(embedding_3d_pca))
         else:
-            # If we are in debug mode, we try to load existing tag data to reduce debugging time.
+            embedding_3d = numpy.zeros((1, 3))
+            embedding_3d_pca = numpy.zeros((1, 3))
-            tag_data_file_name = (
+            strain_3d = 0.0
-                '__tag_data_{0}_{1}__.gz'.format(
+            distance_3d = numpy.zeros((1, 1))
-                    'multi{0}'.format(len(multi_list))
-                    if mode == 'multi' else
-                    '{0}_{1}'.format(*base_language_id),
+        # Showing what we've got.
-                    tag_data_digest))
+        log.debug(
+            '\ncognate_analysis {0}:'
+            '\nembedding 3d:\n{1}'
+            '\nembedding 3d (PCA-oriented):\n{2}'
+            '\nstrain 3d:\n{3}'
+            '\ndistances 3d:\n{4}'.format(
+                language_str,
+                embedding_3d,
+                embedding_3d_pca,
+                strain_3d,
+                distance_3d))
-            # Checking if we have saved data.
+        # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d
+        # embedding distances to break ties.
-            if os.path.exists(tag_data_file_name):
+        if len(distance_data_array) <= 1:
+            mst_list = []
-                with gzip.open(tag_data_file_name, 'rb') as tag_data_file:
-                    entry_already_set, group_list, group_time = pickle.load(tag_data_file)
+        else:
-            else:
+            d_min, d_extra_min, min_i, min_j = min(
+                (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j)
+                for i in range(d_ij.shape[0] - 1)
+                for j in range(i + 1, d_ij.shape[0]))
-                # Don't have existing data, so we gather it and then save it for later use.
+            mst_list = [(min_i, min_j)]
+            mst_dict = {}
-                entry_already_set, group_list, group_time = (
+            # MST construction initialization.
-                    CognateAnalysis.tag_data_plpgsql(
-                        perspective_info_list, group_field_id))
+            for i in range(d_ij.shape[0]):
-                with gzip.open(tag_data_file_name, 'wb') as tag_data_file:
-                    pickle.dump((entry_already_set, group_list, group_time), tag_data_file)
+                if i == min_i or i == min_j:
+                    continue
-        log.debug(
-            '\ncognate_analysis {0}:'
-            '\n{1} entries, {2} groups, {3:.2f}s elapsed time'.format(
-                language_str,
-                len(entry_already_set),
-                len(group_list),
-                group_time))
+                d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i])
+                d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j])
-        if task_status is not None:
-            task_status.set(2, 5, 'Gathering analysis source data')
+                mst_dict[i] = (
+                    (d_min_i, min_i) if d_min_i <= d_min_j else
+                    (d_min_j, min_j))
-        # Getting text data for each perspective.
+        # Iterative MST construction.
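# The construction above is Prim's (Jarnik-Prim-Dijkstra) algorithm with
# lexicographic edge weights: d_ij is the primary key, and the summed 2d/3d
# embedding distances break ties.  A compact sketch of the same idea, seeded
# from vertex 0 for brevity (the code above seeds with the globally minimal
# edge instead):

def prim_mst(d_ij, d_tie):

    n = d_ij.shape[0]
    edge_list = []

    # best[i]: ((primary distance, tie-breaker), nearest tree vertex) for
    # every vertex i not yet in the tree.
    best = {i: ((d_ij[0, i], d_tie[0, i]), 0) for i in range(1, n)}

    while best:

        i_new = min(best, key = lambda i: best[i][0])

        edge_list.append((best[i_new][1], i_new))
        del best[i_new]

        # The newly added vertex may offer shorter connections to the rest.
        for i in best:

            key = (d_ij[i_new, i], d_tie[i_new, i])

            if key < best[i][0]:
                best[i] = (key, i_new)

    return edge_list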
- dbTranslation = aliased(dbEntity, name = 'Translation') - dbSound = aliased(dbEntity, name = 'Sound') - dbMarkup = aliased(dbEntity, name = 'Markup') + while len(mst_dict) > 0: - dbPublishingTranslation = aliased(dbPublishingEntity, name = 'PublishingTranslation') - dbPublishingSound = aliased(dbPublishingEntity, name = 'PublishingSound') - dbPublishingMarkup = aliased(dbPublishingEntity, name = 'PublishingMarkup') + (d_min, d_extra_min, i_min, i_from_min) = min( + (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) - phonemic_data_list = [] - suggestions_data_list = [] + log.debug('\n' + pprint.pformat(mst_dict)) + log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) - sg_total_count = 0 - sg_xcript_count = 0 - sg_xlat_count = 0 - sg_both_count = 0 + mst_list.append((i_from_min, i_min)) + del mst_dict[i_min] - source_perspective_index = None + # Updating shortest connection info. - for index, (perspective_id, transcription_field_id, translation_field_id) in \ - enumerate(perspective_info_list): + for i_to in mst_dict.keys(): - if perspective_id == source_perspective_id: - source_perspective_index = index + d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) - # Getting and saving perspective info. + if d_to < mst_dict[i_to][0]: + mst_dict[i_to] = (d_to, i_min) - perspective = DBSession.query(dbPerspective).filter_by( - client_id = perspective_id[0], object_id = perspective_id[1]).first() + log.debug( + '\ncognate_analysis {0}:' + '\nminimum spanning tree:\n{1}'.format( + language_str, + pprint.pformat(mst_list))) - perspective_name = perspective.get_translation(locale_id) - dictionary_name = perspective.parent.get_translation(locale_id) + # Plotting with matplotlib. + figure_url = None + if __plot_flag__: - transcription_rules = ( - '' if not perspective.additional_metadata else - perspective.additional_metadata.get('transcription_rules', '')) + figure = pyplot.figure(figsize = (10, 10)) + axes = figure.add_subplot(212) - perspective_data = perspective_dict[perspective_id] + axes.set_title( + 'Etymological distance tree (relative distance embedding)', + fontsize = 14, family = 'Gentium') - perspective_data['perspective_name'] = perspective_name - perspective_data['dictionary_name'] = dictionary_name - perspective_data['transcription_rules'] = transcription_rules + axes.axis('equal') + axes.axis('off') + axes.autoscale() - # Preparing to save additional data, if required. + def f(axes, embedding_pca): + """ + Plots specified graph embedding on a given axis. + """ - if mode == 'phonemic': + flag_3d = numpy.size(embedding_pca, 1) > 2 - phonemic_data_list.append([ - '{0} - {1}'.format(dictionary_name, perspective_name), '']) + for index, (position, name) in enumerate( + zip(embedding_pca, distance_header_array)): - elif mode == 'suggestions': + # Checking if any of the previous perspectives are already in this perspective's + # position. - suggestions_data_list.append([]) + same_position_index = None - log.debug( - '\ncognate_analysis {0}:' - '\n dictionary {1}/{2}: {3}' - '\n perspective {4}/{5}: {6}' - '\n transcription_rules: {7}'.format( - language_str, - perspective.parent_client_id, perspective.parent_object_id, - repr(dictionary_name.strip()), - perspective_id[0], perspective_id[1], - repr(perspective_name.strip()), - repr(transcription_rules))) + for i, p in enumerate(embedding_pca[:index]): + if numpy.linalg.norm(position - p) <= 1e-3: - # Getting text data. 
+ same_position_index = i + break - transcription_query = ( + color = matplotlib.colors.hsv_to_rgb( + [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) - DBSession.query( - dbLexicalEntry.client_id, - dbLexicalEntry.object_id).filter( - dbLexicalEntry.parent_client_id == perspective_id[0], - dbLexicalEntry.parent_object_id == perspective_id[1], - dbLexicalEntry.marked_for_deletion == False, - dbEntity.parent_client_id == dbLexicalEntry.client_id, - dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == transcription_field_id[0], - dbEntity.field_object_id == transcription_field_id[1], - dbEntity.marked_for_deletion == False, - dbPublishingEntity.client_id == dbEntity.client_id, - dbPublishingEntity.object_id == dbEntity.object_id, - dbPublishingEntity.published == True, - dbPublishingEntity.accepted == True) + label_same_str = ( + '' if same_position_index is None else + ' (same as {0})'.format(same_position_index + 1)) - .add_columns( - func.array_agg(dbEntity.content).label('transcription')) + kwargs = { + 's': 35, + 'color': color, + 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} - .group_by(dbLexicalEntry)).subquery() + axes.scatter(*position, **kwargs) - translation_query = ( + # Annotating position with its number, but only if we hadn't already annotated nearby. - DBSession.query( - dbLexicalEntry.client_id, - dbLexicalEntry.object_id).filter( - dbLexicalEntry.parent_client_id == perspective_id[0], - dbLexicalEntry.parent_object_id == perspective_id[1], - dbLexicalEntry.marked_for_deletion == False, - dbEntity.parent_client_id == dbLexicalEntry.client_id, - dbEntity.parent_object_id == dbLexicalEntry.object_id, - dbEntity.field_client_id == translation_field_id[0], - dbEntity.field_object_id == translation_field_id[1], - dbEntity.marked_for_deletion == False, - dbPublishingEntity.client_id == dbEntity.client_id, - dbPublishingEntity.object_id == dbEntity.object_id, - dbPublishingEntity.published == True, - dbPublishingEntity.accepted == True) + if same_position_index is None: - .add_columns( - func.array_agg(dbEntity.content).label('translation')) + if flag_3d: - .group_by(dbLexicalEntry)).subquery() + axes.text( + position[0] + 0.01, position[1], position[2] + 0.01, + str(index + 1), None, fontsize = 14) - # Main query for transcription/translation data. + else: - data_query = ( - DBSession.query(transcription_query) + axes.annotate( + str(index + 1), + (position[0] + 0.01, position[1] - 0.005), + fontsize = 14) - .outerjoin(translation_query, and_( - transcription_query.c.client_id == translation_query.c.client_id, - transcription_query.c.object_id == translation_query.c.object_id)) + # Plotting minimum spanning trees. - .add_columns( - translation_query.c.translation)) + line_list = [ + (embedding_pca[i], embedding_pca[j]) + for i, j in mst_list] - # If we need to do an acoustic analysis, we also get sound/markup data. 
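# Color assignment in the plotting helper above, in isolation: a perspective
# landing within 1e-3 of an earlier point reuses that point's hue, so
# coinciding languages share a color.  Note the explicit `is None` test here;
# the `same_position_index or index` expression above falls back to `index`
# when the matched point happens to be the very first one (index 0).

import numpy
import matplotlib.colors

def point_colors(embedding, tolerance = 1e-3):

    color_list = []

    for index, position in enumerate(embedding):

        same_index = next(
            (i for i, p in enumerate(embedding[:index])
                if numpy.linalg.norm(position - p) <= tolerance),
            None)

        hue = (index if same_index is None else same_index) / len(embedding)
        color_list.append(matplotlib.colors.hsv_to_rgb([hue, 0.5, 0.75]))

    return color_list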
+ line_collection = ( + Line3DCollection if flag_3d else LineCollection)( + line_list, zorder = 0, color = 'gray') - if mode == 'acoustic': + axes.add_collection(line_collection) - sound_markup_query = ( + pyplot.setp(axes.texts, family = 'Gentium') - DBSession.query( - dbLexicalEntry.client_id, - dbLexicalEntry.object_id).filter( - dbLexicalEntry.parent_client_id == perspective_id[0], - dbLexicalEntry.parent_object_id == perspective_id[1], - dbLexicalEntry.marked_for_deletion == False, - dbMarkup.parent_client_id == dbLexicalEntry.client_id, - dbMarkup.parent_object_id == dbLexicalEntry.object_id, - dbMarkup.marked_for_deletion == False, - dbMarkup.additional_metadata.contains({'data_type': 'praat markup'}), - dbPublishingMarkup.client_id == dbMarkup.client_id, - dbPublishingMarkup.object_id == dbMarkup.object_id, - dbPublishingMarkup.published == True, - dbPublishingMarkup.accepted == True, - dbSound.client_id == dbMarkup.self_client_id, - dbSound.object_id == dbMarkup.self_object_id, - dbSound.marked_for_deletion == False, - dbPublishingSound.client_id == dbSound.client_id, - dbPublishingSound.object_id == dbSound.object_id, - dbPublishingSound.published == True, - dbPublishingSound.accepted == True) + # Plotting our embedding, creating the legend. - .add_columns( + f(axes, embedding_2d_pca) - func.jsonb_agg(func.jsonb_build_array( - dbSound.client_id, dbSound.object_id, dbSound.content, - dbMarkup.client_id, dbMarkup.object_id, dbMarkup.content)) + pyplot.tight_layout() - .label('sound_markup')) + legend = axes.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - .group_by(dbLexicalEntry)).subquery() + pyplot.setp(legend.texts, family = 'Gentium') + axes.autoscale_view() - # Adding sound/markup retrieval to the main query. + # Saving generated figure for debug purposes, if required. - data_query = ( - data_query + if __debug_flag__: - .outerjoin(sound_markup_query, and_( - transcription_query.c.client_id == sound_markup_query.c.client_id, - transcription_query.c.object_id == sound_markup_query.c.object_id)) + figure_file_name = ( + 'figure cognate distance{0}.png'.format( + mode_name_str)) - .add_columns( - sound_markup_query.c.sound_markup)) + with open(figure_file_name, 'wb') as figure_file: - # If we are in asynchronous mode, we need to look up how many data rows we need - # to process for this perspective. + pyplot.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') - if task_status is not None: + # Also generating 3d embedding figure. - row_count = data_query.count() + figure_3d = pyplot.figure() + figure_3d.set_size_inches(16, 10) - log.debug( - 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( - language_str, - perspective_id[0], perspective_id[1], - row_count)) + axes_3d = figure_3d.add_subplot(111, projection = '3d') - # Grouping transcriptions and translations by lexical entries. + axes_3d.axis('equal') + axes_3d.view_init(elev = 30, azim = -75) - for row_index, row in enumerate(data_query.all()): + f(axes_3d, embedding_3d_pca) - entry_id = tuple(row[:2]) - transcription_list, translation_list = row[2:4] + # Setting up legend. 
- transcription_list = ( - [] if not transcription_list else [ - transcription.strip() - for transcription in transcription_list - if transcription.strip()]) + axes_3d.set_xlabel('X') + axes_3d.set_ylabel('Y') + axes_3d.set_zlabel('Z') - # If we have no trascriptions for this lexical entry, we skip it altogether. + legend_3d = axes_3d.legend( + scatterpoints = 1, + loc = 'upper center', + bbox_to_anchor = (0.5, -0.05), + frameon = False, + handlelength = 0.5, + handletextpad = 0.75, + fontsize = 14) - if not transcription_list: - continue + pyplot.setp(legend_3d.texts, family = 'Gentium') - translation_list = ( - [] if not translation_list else [ - translation.strip() - for translation in translation_list - if translation.strip()]) + # Fake cubic bounding box to force axis aspect ratios, see + # https://stackoverflow.com/a/13701747/2016856. - # Saving transcription / translation data. + X = embedding_3d_pca[:,0] + Y = embedding_3d_pca[:,1] + Z = embedding_3d_pca[:,2] - translation_str = ( - translation_list[0] if translation_list else '') + max_range = numpy.array([ + X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() - if mode == 'phonemic': + Xb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + + 0.5 * (X.max() + X.min())) - for transcription in transcription_list: - phonemic_data_list[-1].extend([transcription, translation_str]) + Yb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + + 0.5 * (Y.max() + Y.min())) - elif mode == 'suggestions' and entry_id not in entry_already_set: + Zb = ( + 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + + 0.5 * (Z.max() + Z.min())) - suggestions_data_list[-1].append([ - '|'.join(transcription_list), - '|'.join(translation_list)]) + for xb, yb, zb in zip(Xb, Yb, Zb): + axes_3d.plot([xb], [yb], [zb], 'w') - sg_total_count += 1 + axes_3d.autoscale_view() - # Counting how many instances of more than one transcription and / or translation - # we have. + # And saving it. - if len(transcription_list) > 1: - sg_xcript_count += 1 + figure_3d_file_name = ( + 'figure 3d cognate distance{0}.png'.format( + mode_name_str)) - if len(translation_list) > 1: - sg_xlat_count += 1 + with open(figure_3d_file_name, 'wb') as figure_3d_file: - if len(transcription_list) > 1 and len(translation_list) > 1: - sg_both_count += 1 + figure_3d.savefig( + figure_3d_file, + bbox_extra_artists = (legend_3d,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') - # If we are fetching additional acoustic data, it's possible we have to process - # sound recordings and markup this lexical entry has. + # Storing generated figure as a PNG image. + current_datetime = datetime.datetime.now(datetime.timezone.utc) + figure_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) - if len(row) > 4 and row[4]: + figure_path = os.path.join(storage_dir, figure_filename) + os.makedirs(os.path.dirname(figure_path), exist_ok = True) - row_list = row[4][0] + with open(figure_path, 'wb') as figure_file: - result = ( - CognateAnalysis.acoustic_data( - base_language_id, - tuple(row_list[0:2]), row_list[2], - tuple(row_list[3:5]), row_list[5], - storage, - __debug_flag__)) + figure.savefig( + figure_file, + bbox_extra_artists = (legend,), + bbox_inches = 'tight', + pad_inches = 0.25, + format = 'png') - # Updating task progress, if required. 
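# The bounding-box loop above is the standard workaround for matplotlib 3d
# axes ignoring axis('equal'): plotting eight invisible white points at the
# corners of a data-centered cube forces equal ranges on all three axes
# (https://stackoverflow.com/a/13701747/2016856).  In isolation:

import numpy

def force_equal_aspect_3d(axes_3d, xyz):

    half = (xyz.max(axis = 0) - xyz.min(axis = 0)).max() / 2
    center = (xyz.max(axis = 0) + xyz.min(axis = 0)) / 2

    # Corners of the cube: center +/- half along each of the three axes.
    for corner in center + half * numpy.mgrid[-1:2:2, -1:2:2, -1:2:2].reshape(3, -1).T:
        axes_3d.plot([corner[0]], [corner[1]], [corner[2]], 'w')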
+ cur_time = time.time() + figure_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', figure_filename]) + ### Plotting with matplotlib ends - if task_status is not None: + return ( + figure_url, + mst_list, + embedding_2d_pca, + embedding_3d_pca + ) - percent = int(math.floor(90.0 * - (index + float(row_index + 1) / row_count) / - len(perspective_info_list))) + @staticmethod + def perform_cognate_analysis( + language_str, + source_perspective_id, + base_language_id, + base_language_name, + group_field_id, + perspective_info_list, + multi_list, + multi_name_list, + mode, + distance_flag, + reference_perspective_id, + figure_flag, + distance_vowel_flag, + distance_consonant_flag, + match_translations_value, + only_orphans_flag, + locale_id, + storage, + task_status = None, + __debug_flag__ = False, + __intermediate_flag__ = False): + """ + Performs cognate analysis in either synchronous or asynchronous mode. + """ - task_status.set(2, 5 + percent, 'Gathering analysis source data') + __result_flag__ = False - entry_data_list = (index, - transcription_list, - translation_list, - result) + if task_status is not None: + task_status.set(1, 0, 'Gathering grouping data') - # No additional acoustic data. + # Sometimes in debugging mode we should return already computed results. - else: - entry_data_list = (index, transcription_list, translation_list) + if __debug_flag__: - text_dict[entry_id] = entry_data_list + tag_data_digest = ( - entry_id_key = ( + hashlib.md5( - index, - '|'.join(transcription_list) + ( - ' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) + repr(list(group_field_id) + + [perspective_info[0] for perspective_info in perspective_info_list]) - entry_id_dict[entry_id_key] = entry_id + .encode('utf-8')) - # Showing some info on non-grouped entries, if required. + .hexdigest()) - if mode == 'suggestions': + result_file_name = ( - log.debug( - '\ncognate_analysis {0}:' - '\n{1} non-grouped entries' - '\n{2} with multiple transcriptions' - '\n{3} with multiple translations' - '\n{4} with multiple transcriptions and translations'.format( - language_str, - sg_total_count, - sg_xcript_count, - sg_xlat_count, - sg_both_count)) - - # Also, if we are computing cognate suggestions, we should have a valid source perspective, it's - # an error otherwise. - - if source_perspective_index is None: - - return ResponseError(message = - 'Cognate suggestions require that the source perspective ' - 'is among the ones being analyzed.') - - if task_status is not None: - task_status.set(3, 95, 'Performing analysis') - - # Ok, and now we form the source data for analysis. + '__result_{0}_{1}__.gz'.format( - result_list = [[]] + 'multi{0}'.format(len(multi_list)) + if mode == 'multi' else + '{0}_{1}'.format(*base_language_id), - perspective_id_list = [] - perspective_name_list = [] + tag_data_digest)) - for perspective_id, transcription_field_id, translation_field_id in perspective_info_list: + if __result_flag__ and os.path.exists(result_file_name): - perspective_id_list.append(perspective_id) - perspective_data = perspective_dict[perspective_id] + with gzip.open( + result_file_name, 'rb') as result_file: - perspective_str = '{0} - {1}'.format( - perspective_data['dictionary_name'], - perspective_data['perspective_name']) + result_dict = pickle.load(result_file) - perspective_name_list.append(perspective_str) + return CognateAnalysis(**result_dict) - # Also going to use transcription transformation rules. + # Gathering entry grouping data. 
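# The debug-mode shortcut above keys cached results on an md5 digest of the
# analysis arguments, so a rerun with the same grouping field and perspectives
# can skip recomputation.  The pattern, reduced to its essentials (the file
# name scheme here is illustrative):

import gzip, hashlib, os, pickle

def digest_cached(arguments, compute):

    digest = hashlib.md5(repr(arguments).encode('utf-8')).hexdigest()
    file_name = '__result_{0}__.gz'.format(digest)

    if os.path.exists(file_name):
        with gzip.open(file_name, 'rb') as cache_file:
            return pickle.load(cache_file)

    result = compute()

    with gzip.open(file_name, 'wb') as cache_file:
        pickle.dump(result, cache_file)

    return result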
- result_list[0].extend([ - perspective_str, - perspective_data['transcription_rules']]) + perspective_dict = collections.defaultdict(dict) - log.debug( - '\ncognate_analysis {0}:' - '\nsource_perspective_index: {1}' - '\nperspective_list:\n{2}' - '\nheader_list:\n{3}'.format( - language_str, - source_perspective_index, - pprint.pformat(perspective_name_list, width = 108), - pprint.pformat(result_list[0], width = 108))) + entry_already_set = set() + group_list = [] - # Each group of lexical entries. + tag_dict = collections.defaultdict(set) - not_enough_count = 0 + text_dict = {} + entry_id_dict = {} - total_transcription_count = 0 - total_translation_count = 0 + if not __debug_flag__: - not_suggestions = mode != 'suggestions' + entry_already_set, group_list, group_time = ( - for entry_id_set in group_list: + CognateAnalysis.tag_data_plpgsql( + perspective_info_list, group_field_id)) - group_entry_id_list = [[] - for i in range(len(perspective_info_list))] + else: - group_transcription_list = [[] - for i in range(len(perspective_info_list))] + # If we are in debug mode, we try to load existing tag data to reduce debugging time. - group_translation_list = [[] - for i in range(len(perspective_info_list))] + tag_data_file_name = ( - group_acoustic_list = [None - for i in range(len(perspective_info_list))] + '__tag_data_{0}_{1}__.gz'.format( - transcription_count = 0 - translation_count = 0 + 'multi{0}'.format(len(multi_list)) + if mode == 'multi' else + '{0}_{1}'.format(*base_language_id), - for entry_id in entry_id_set: + tag_data_digest)) - if entry_id not in text_dict: - continue + # Checking if we have saved data. - # Processing text data of each entry of the group. + if os.path.exists(tag_data_file_name): - entry_data_list = text_dict[entry_id] + with gzip.open(tag_data_file_name, 'rb') as tag_data_file: + entry_already_set, group_list, group_time = pickle.load(tag_data_file) - (index, - transcription_list, - translation_list) = ( + else: - entry_data_list[:3]) + # Don't have existing data, so we gather it and then save it for later use. - group_entry_id_list[index].append(entry_id) + entry_already_set, group_list, group_time = ( - group_transcription_list[index].extend(transcription_list) - group_translation_list[index].extend(translation_list) + CognateAnalysis.tag_data_plpgsql( + perspective_info_list, group_field_id)) - transcription_count += len(transcription_list) - translation_count += len(translation_list) + with gzip.open(tag_data_file_name, 'wb') as tag_data_file: + pickle.dump((entry_already_set, group_list, group_time), tag_data_file) - if (len(entry_data_list) > 3 and - entry_data_list[3] and - group_acoustic_list[index] is None): + log.debug( + '\ncognate_analysis {0}:' + '\n{1} entries, {2} groups, {3:.2f}s elapsed time'.format( + language_str, + len(entry_already_set), + len(group_list), + group_time)) - group_acoustic_list[index] = entry_data_list[3] + if task_status is not None: + task_status.set(2, 5, 'Gathering analysis source data') - # Dropping groups with transcriptions from no more than a single dictionary, if required. + # Getting text data for each perspective. 
- if (not_suggestions and - sum(min(1, len(transcription_list)) - for transcription_list in group_transcription_list) <= 1): + dbTranslation = aliased(dbEntity, name = 'Translation') + dbSound = aliased(dbEntity, name = 'Sound') + dbMarkup = aliased(dbEntity, name = 'Markup') - not_enough_count += 1 - continue + dbPublishingTranslation = aliased(dbPublishingEntity, name = 'PublishingTranslation') + dbPublishingSound = aliased(dbPublishingEntity, name = 'PublishingSound') + dbPublishingMarkup = aliased(dbPublishingEntity, name = 'PublishingMarkup') - total_transcription_count += transcription_count - total_translation_count += translation_count + phonemic_data_list = [] + suggestions_data_list = [] - result_list.append([]) + sg_total_count = 0 + sg_xcript_count = 0 + sg_xlat_count = 0 + sg_both_count = 0 - group_zipper = zip( - group_entry_id_list, - group_transcription_list, - group_translation_list, - group_acoustic_list) + source_perspective_index = None + for index, (perspective_id, transcription_field_id, translation_field_id) in \ + enumerate(perspective_info_list): - # Forming row of the source data table based on the entry group. + if perspective_id == source_perspective_id: + source_perspective_index = index - for ( - index, ( - entry_id_list, - transcription_list, - translation_list, - acoustic_list)) in ( + # Getting and saving perspective info. - enumerate(group_zipper)): + perspective = DBSession.query(dbPerspective).filter_by( + client_id = perspective_id[0], object_id = perspective_id[1]).first() - transcription_str = '|'.join(transcription_list) - translation_str = '|'.join(translation_list) + perspective_name = perspective.get_translation(locale_id) + dictionary_name = perspective.parent.get_translation(locale_id) - result_list[-1].append(transcription_str) - result_list[-1].append(translation_str) + transcription_rules = ( + '' if not perspective.additional_metadata else + perspective.additional_metadata.get('transcription_rules', '')) - if mode == 'acoustic': - result_list[-1].extend(acoustic_list or ['', '', '', '', '']) + perspective_data = perspective_dict[perspective_id] - # Saving mapping from the translation / transcription info string to an id of one entry of - # the group. + perspective_data['perspective_name'] = perspective_name + perspective_data['dictionary_name'] = dictionary_name + perspective_data['transcription_rules'] = transcription_rules - if transcription_list or translation_list: + # Preparing to save additional data, if required. - entry_id_key = ( + if mode == 'phonemic': - index, - transcription_str + ( - ' ʽ' + translation_str + 'ʼ' if translation_str else '')) + phonemic_data_list.append([ + '{0} - {1}'.format(dictionary_name, perspective_name), '']) - entry_id_dict[entry_id_key] = entry_id_list[0] + elif mode == 'suggestions': - # Showing what we've gathered. + suggestions_data_list.append([]) - log.debug( - '\ncognate_analysis {0}:' - '\n len(group_list): {1}' - '\n len(result_list): {2}' - '\n not_enough_count: {3}' - '\n transcription_count: {4}' - '\n translation_count: {5}' - '\n result_list:\n{6}'.format( + log.debug( + '\ncognate_analysis {0}:' + '\n dictionary {1}/{2}: {3}' + '\n perspective {4}/{5}: {6}' + '\n transcription_rules: {7}'.format( language_str, - len(group_list), - len(result_list), - not_enough_count, - total_transcription_count, - total_translation_count, - pprint.pformat(result_list, width = 108))) - - # If we have no data at all, we return empty result. 
- - if len(result_list) <= 1 and not_suggestions: + perspective.parent_client_id, perspective.parent_object_id, + repr(dictionary_name.strip()), + perspective_id[0], perspective_id[1], + repr(perspective_name.strip()), + repr(transcription_rules))) - return CognateAnalysis( - triumph = True, - dictionary_count = len(perspective_info_list), - group_count = len(group_list), - not_enough_count = not_enough_count, - transcription_count = total_transcription_count, - translation_count = total_translation_count, - result = '', - xlsx_url = '', - distance_list = [], - figure_url = '', - intermediate_url_list = None) + # Getting text data. - analysis_f = ( - cognate_acoustic_analysis_f if mode == 'acoustic' else - cognate_reconstruction_f if mode == 'reconstruction' else - cognate_reconstruction_multi_f if mode == 'multi' else - cognate_suggestions_f if mode == 'suggestions' else - cognate_analysis_f) + transcription_query = ( - # Preparing analysis input. + DBSession.query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id).filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == transcription_field_id[0], + dbEntity.field_object_id == transcription_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) - phonemic_input_list = [ - ''.join(text + '\0' for text in text_list) - for text_list in phonemic_data_list] + .add_columns( + func.array_agg(dbEntity.content).label('transcription')) - suggestions_result_list = [] + .group_by(dbLexicalEntry)).subquery() - for tt_list in itertools.zip_longest( - *suggestions_data_list, fillvalue = ['', '']): + translation_query = ( - suggestions_result_list.append([]) + DBSession.query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id).filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbEntity.parent_client_id == dbLexicalEntry.client_id, + dbEntity.parent_object_id == dbLexicalEntry.object_id, + dbEntity.field_client_id == translation_field_id[0], + dbEntity.field_object_id == translation_field_id[1], + dbEntity.marked_for_deletion == False, + dbPublishingEntity.client_id == dbEntity.client_id, + dbPublishingEntity.object_id == dbEntity.object_id, + dbPublishingEntity.published == True, + dbPublishingEntity.accepted == True) - for tt in tt_list: - suggestions_result_list[-1].extend(tt) + .add_columns( + func.array_agg(dbEntity.content).label('translation')) - if mode == 'suggestions': + .group_by(dbLexicalEntry)).subquery() - # Showing additional ungrouped input data, if required. + # Main query for transcription/translation data. 
- log.debug( - '\ncognate_analysis {0}:' - '\nsuggestions_result_list:\n{1}'.format( - language_str, - pprint.pformat(suggestions_result_list, width = 144))) + data_query = ( + DBSession.query(transcription_query) - result_input = ( + .outerjoin(translation_query, and_( + transcription_query.c.client_id == translation_query.c.client_id, + transcription_query.c.object_id == translation_query.c.object_id)) - ''.join( - ''.join(text + '\0' for text in text_list) + .add_columns( + translation_query.c.translation)) - for text_list in ( - result_list + suggestions_result_list))) + # If we need to do an acoustic analysis, we also get sound/markup data. - input = '\0'.join(phonemic_input_list + [result_input]) + if mode == 'acoustic': - log.debug( - '\ncognate_analysis {0}:' - '\nanalysis_f: {1}' - '\ninput ({2} columns, {3} rows{4}):\n{5}'.format( - language_str, - repr(analysis_f), - len(perspective_info_list), - len(result_list), - '' if mode != 'suggestions' else - ', {0} ungrouped rows'.format(len(suggestions_result_list)), - pprint.pformat([input[i : i + 256] - for i in range(0, len(input), 256)], width = 144))) + sound_markup_query = ( - # Saving input to a file, if required. + DBSession.query( + dbLexicalEntry.client_id, + dbLexicalEntry.object_id).filter( + dbLexicalEntry.parent_client_id == perspective_id[0], + dbLexicalEntry.parent_object_id == perspective_id[1], + dbLexicalEntry.marked_for_deletion == False, + dbMarkup.parent_client_id == dbLexicalEntry.client_id, + dbMarkup.parent_object_id == dbLexicalEntry.object_id, + dbMarkup.marked_for_deletion == False, + dbMarkup.additional_metadata.contains({'data_type': 'praat markup'}), + dbPublishingMarkup.client_id == dbMarkup.client_id, + dbPublishingMarkup.object_id == dbMarkup.object_id, + dbPublishingMarkup.published == True, + dbPublishingMarkup.accepted == True, + dbSound.client_id == dbMarkup.self_client_id, + dbSound.object_id == dbMarkup.self_object_id, + dbSound.marked_for_deletion == False, + dbPublishingSound.client_id == dbSound.client_id, + dbPublishingSound.object_id == dbSound.object_id, + dbPublishingSound.published == True, + dbPublishingSound.accepted == True) - storage_dir = None - intermediate_url_list = [] + .add_columns( - if __debug_flag__ or __intermediate_flag__: + func.jsonb_agg(func.jsonb_build_array( + dbSound.client_id, dbSound.object_id, dbSound.content, + dbMarkup.client_id, dbMarkup.object_id, dbMarkup.content)) - language_name_str = ( - ' '.join(multi_name_list) if mode == 'multi' else - base_language_name.strip()) + .label('sound_markup')) - if len(language_name_str) > 64: - language_name_str = language_name_str[:64] + '...' + .group_by(dbLexicalEntry)).subquery() - mode_name_str = ( + # Adding sound/markup retrieval to the main query. - '{0} {1} {2} {3}{4}'.format( + data_query = ( + data_query - ' multi{0}'.format(len(multi_list)) - if mode == 'multi' else - (' ' + mode if mode else ''), + .outerjoin(sound_markup_query, and_( + transcription_query.c.client_id == sound_markup_query.c.client_id, + transcription_query.c.object_id == sound_markup_query.c.object_id)) - language_name_str, + .add_columns( + sound_markup_query.c.sound_markup)) - ' '.join(str(count) for id, count in multi_list) - if mode == 'multi' else - len(perspective_info_list), + # If we are in asynchronous mode, we need to look up how many data rows we need + # to process for this perspective. 
- len(result_list), + if task_status is not None: - '' if not_suggestions else - ' {} {} {} {}'.format( - len(suggestions_result_list), - source_perspective_index, - match_translations_value, - int(only_orphans_flag)))) + row_count = data_query.count() - cognate_name_str = ( - 'cognate' + mode_name_str) + log.debug( + 'cognate_analysis {0}: perspective {1}/{2}: {3} data rows'.format( + language_str, + perspective_id[0], perspective_id[1], + row_count)) - # Initializing file storage directory, if required. + # Grouping transcriptions and translations by lexical entries. - if __intermediate_flag__ and storage_dir is None: + for row_index, row in enumerate(data_query.all()): - cur_time = time.time() + entry_id = tuple(row[:2]) + transcription_list, translation_list = row[2:4] - storage_dir = os.path.join( - storage['path'], 'cognate', str(cur_time)) + transcription_list = ( + [] if not transcription_list else [ + transcription.strip() + for transcription in transcription_list + if transcription.strip()]) - for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): + # If we have no trascriptions for this lexical entry, we skip it altogether. - input_file_name = ( + if not transcription_list: + continue - pathvalidate.sanitize_filename( - 'input {0}.{1}'.format( - cognate_name_str, extension))) + translation_list = ( + [] if not translation_list else [ + translation.strip() + for translation in translation_list + if translation.strip()]) - # Saving to the working directory... + # Saving transcription / translation data. - if __debug_flag__: + translation_str = ( + translation_list[0] if translation_list else '') - with open(input_file_name, 'wb') as input_file: - input_file.write(input.encode(encoding)) + if mode == 'phonemic': - # ...and / or to the file storage. + for transcription in transcription_list: + phonemic_data_list[-1].extend([transcription, translation_str]) - if __intermediate_flag__: + elif mode == 'suggestions' and entry_id not in entry_already_set: - input_path = os.path.join( - storage_dir, input_file_name) + suggestions_data_list[-1].append([ + '|'.join(transcription_list), + '|'.join(translation_list)]) - os.makedirs( - os.path.dirname(input_path), - exist_ok = True) + sg_total_count += 1 - with open(input_path, 'wb') as input_file: - input_file.write(input.encode(encoding)) + # Counting how many instances of more than one transcription and / or translation + # we have. - input_url = ''.join([ - storage['prefix'], - storage['static_route'], - 'cognate', '/', - str(cur_time), '/', - input_file_name]) + if len(transcription_list) > 1: + sg_xcript_count += 1 - intermediate_url_list.append(input_url) + if len(translation_list) > 1: + sg_xlat_count += 1 - # Calling analysis library, starting with getting required output buffer size and continuing - # with analysis proper. + if len(transcription_list) > 1 and len(translation_list) > 1: + sg_both_count += 1 - if mode == 'multi': + # If we are fetching additional acoustic data, it's possible we have to process + # sound recordings and markup this lexical entry has. 
- multi_count_list = [ - perspective_count - for language_id, perspective_count in multi_list] + if len(row) > 4 and row[4]: - perspective_count_array = ( - ctypes.c_int * len(multi_list))(*multi_count_list) + row_list = row[4][0] - # int CognateMultiReconstruct_GetAllOutput( - # LPTSTR bufIn, int* pnCols, int nGroups, int nRows, LPTSTR bufOut, int flags) + result = ( + CognateAnalysis.acoustic_data( + base_language_id, + tuple(row_list[0:2]), row_list[2], + tuple(row_list[3:5]), row_list[5], + storage, + __debug_flag__)) - output_buffer_size = analysis_f( - None, - perspective_count_array, - len(multi_list), - len(result_list), - None, - 1) + # Updating task progress, if required. - elif mode == 'suggestions': + if task_status is not None: - # int GuessCognates_GetAllOutput( - # LPTSTR bufIn, int nCols, int nRowsCorresp, int nRowsRest, int iDictThis, int lookMeaning, - # int onlyOrphans, LPTSTR bufOut, int flags) + percent = int(math.floor(90.0 * + (index + float(row_index + 1) / row_count) / + len(perspective_info_list))) - output_buffer_size = analysis_f( - None, - len(perspective_info_list), - len(result_list), - len(suggestions_result_list), - source_perspective_index, - match_translations_value, - int(only_orphans_flag), - None, - 1) + task_status.set(2, 5 + percent, 'Gathering analysis source data') - else: + entry_data_list = (index, + transcription_list, + translation_list, + result) - # int CognateAnalysis_GetAllOutput( - # LPTSTR bufIn, int nCols, int nRows, LPTSTR bufOut, int flags) + # No additional acoustic data. - output_buffer_size = analysis_f( - None, - len(perspective_info_list), - len(result_list), - None, - 1) + else: + entry_data_list = (index, transcription_list, translation_list) - log.debug( - '\ncognate_analysis {0}: output buffer size {1}'.format( - language_str, - output_buffer_size)) + text_dict[entry_id] = entry_data_list - input_buffer = ctypes.create_unicode_buffer(input) + entry_id_key = ( - # Saving input buffer to a file, if required. + index, + '|'.join(transcription_list) + ( + ' ʽ' + '|'.join(translation_list) + 'ʼ' if translation_list else '')) - if __debug_flag__: + entry_id_dict[entry_id_key] = entry_id - input_file_name = ( - 'input {0}.buffer'.format( - cognate_name_str)) + # Showing some info on non-grouped entries, if required. - with open(input_file_name, 'wb') as input_file: - input_file.write(bytes(input_buffer)) + if mode == 'suggestions': - output_buffer = ctypes.create_unicode_buffer(output_buffer_size + 256) + log.debug( + '\ncognate_analysis {0}:' + '\n{1} non-grouped entries' + '\n{2} with multiple transcriptions' + '\n{3} with multiple translations' + '\n{4} with multiple transcriptions and translations'.format( + language_str, + sg_total_count, + sg_xcript_count, + sg_xlat_count, + sg_both_count)) - if mode == 'multi': + # Also, if we are computing cognate suggestions, we should have a valid source perspective, it's + # an error otherwise. 
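# Progress reporting above interpolates inside the 5..95 band: each
# perspective gets an equal share of 90 percentage points, and processed rows
# advance fractionally through that share.  The same arithmetic, factored out:

import math

def source_data_progress(index, row_index, row_count, perspective_count):
    return 5 + int(
        math.floor(
            90.0 * (index + float(row_index + 1) / row_count)
            / perspective_count))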
- result = analysis_f( - input_buffer, - perspective_count_array, - len(multi_list), - len(result_list), - output_buffer, - 1) + if source_perspective_index is None: - elif mode == 'suggestions': + return ResponseError(message = + 'Cognate suggestions require that the source perspective ' + 'is among the ones being analyzed.') - result = analysis_f( - input_buffer, - len(perspective_info_list), - len(result_list), - len(suggestions_result_list), - source_perspective_index, - match_translations_value, - int(only_orphans_flag), - output_buffer, - 1) + if task_status is not None: + task_status.set(3, 95, 'Performing analysis') - else: + # Ok, and now we form the source data for analysis. - result = analysis_f( - input_buffer, - len(perspective_info_list), - len(result_list), - output_buffer, - 1) + result_list = [[]] - log.debug( - '\ncognate_analysis {0}: result {1}'.format( - language_str, - result)) + perspective_id_list = [] + perspective_name_list = [] - # If we don't have a good result, we return an error. + for perspective_id, transcription_field_id, translation_field_id in perspective_info_list: - if result <= 0: + perspective_id_list.append(perspective_id) + perspective_data = perspective_dict[perspective_id] - if task_status is not None: + perspective_str = '{0} - {1}'.format( + perspective_data['dictionary_name'], + perspective_data['perspective_name']) - task_status.set(5, 100, - 'Finished (ERROR): library call error {0}'.format(result)) + perspective_name_list.append(perspective_str) - return ResponseError(message = - 'Cognate analysis library call error {0}'.format(result)) + # Also going to use transcription transformation rules. - output = output_buffer.value + result_list[0].extend([ + perspective_str, + perspective_data['transcription_rules']]) log.debug( - '\ncognate_analysis {}:\noutput ({}):\n{}'.format( + '\ncognate_analysis {0}:' + '\nsource_perspective_index: {1}' + '\nperspective_list:\n{2}' + '\nheader_list:\n{3}'.format( language_str, - len(output), - pprint.pformat([output[i : i + 256] - for i in range(0, len(output), 256)], width = 144))) - - # Saving output buffer and output to files, if required. - - if __debug_flag__: - - output_file_name = ( - 'output {0}.buffer'.format( - cognate_name_str)) + source_perspective_index, + pprint.pformat(perspective_name_list, width = 108), + pprint.pformat(result_list[0], width = 108))) - with open(output_file_name, 'wb') as output_file: - output_file.write(bytes(output_buffer)) + # Each group of lexical entries. - for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): + not_enough_count = 0 - output_file_name = ( - 'output {0}.{1}'.format( - cognate_name_str, - extension)) + total_transcription_count = 0 + total_translation_count = 0 - with open(output_file_name, 'wb') as output_file: - output_file.write(output.encode(encoding)) + not_suggestions = mode != 'suggestions' - # Reflowing output. 
+ for entry_id_set in group_list: - line_list = output.split('\r\n') + group_entry_id_list = [[] + for i in range(len(perspective_info_list))] - text_wrapper = textwrap.TextWrapper( - width = max(196, len(perspective_info_list) * 40), tabsize = 20) + group_transcription_list = [[] + for i in range(len(perspective_info_list))] - reflow_list = [] + group_translation_list = [[] + for i in range(len(perspective_info_list))] - for line in line_list: - reflow_list.extend(text_wrapper.wrap(line)) + group_acoustic_list = [None + for i in range(len(perspective_info_list))] - wrapped_output = '\n'.join(reflow_list) + transcription_count = 0 + translation_count = 0 - log.debug( - 'cognate_analysis {0}:\nwrapped output:\n{1}'.format( - language_str, - wrapped_output)) + for entry_id in entry_id_set: - # Getting binary output for parsing and exporting. + if entry_id not in text_dict: + continue - if mode == 'multi': + # Processing text data of each entry of the group. - result_binary = analysis_f( - input_buffer, - perspective_count_array, - len(multi_list), - len(result_list), - output_buffer, - 2) + entry_data_list = text_dict[entry_id] - # If we are in the suggestions mode, we currently just return the output. + (index, + transcription_list, + translation_list) = ( - elif mode == 'suggestions': + entry_data_list[:3]) - result_binary = analysis_f( - input_buffer, - len(perspective_info_list), - len(result_list), - len(suggestions_result_list), - source_perspective_index, - match_translations_value, - int(only_orphans_flag), - output_buffer, - 2) + group_entry_id_list[index].append(entry_id) - else: + group_transcription_list[index].extend(transcription_list) + group_translation_list[index].extend(translation_list) - result_binary = analysis_f( - input_buffer, - len(perspective_info_list), - len(result_list), - output_buffer, - 2) + transcription_count += len(transcription_list) + translation_count += len(translation_list) - log.debug( - 'cognate_analysis {0}: result_binary {1}'.format( - language_str, - result_binary)) + if (len(entry_data_list) > 3 and + entry_data_list[3] and + group_acoustic_list[index] is None): - if result_binary <= 0: + group_acoustic_list[index] = entry_data_list[3] - if task_status is not None: + # Dropping groups with transcriptions from no more than a single dictionary, if required. - task_status.set(5, 100, - 'Finished (ERROR): library call (binary) error {0}'.format(result_binary)) + if (not_suggestions and + sum(min(1, len(transcription_list)) + for transcription_list in group_transcription_list) <= 1): - return ResponseError(message = - 'Cognate analysis library call (binary) error {0}'.format(result_binary)) + not_enough_count += 1 + continue - # Showing what we've got from the binary output call. + total_transcription_count += transcription_count + total_translation_count += translation_count - output_binary = output_buffer[:result_binary] + result_list.append([]) - output_binary_list = [ - output_binary[i : i + 256] - for i in range(0, len(output_binary), 256)] + group_zipper = zip( + group_entry_id_list, + group_transcription_list, + group_translation_list, + group_acoustic_list) - log.debug( - '\ncognate_analysis {0}:' - '\noutput_binary:\n{1}'.format( - language_str, - pprint.pformat( - output_binary_list, width = 144))) + # Forming row of the source data table based on the entry group. - # Saving binary output buffer and binary output to files, if required. 
+ for ( + index, ( + entry_id_list, + transcription_list, + translation_list, + acoustic_list)) in ( - if __debug_flag__: + enumerate(group_zipper)): - output_file_name = ( - 'output binary {0}.buffer'.format( - cognate_name_str)) + transcription_str = '|'.join(transcription_list) + translation_str = '|'.join(translation_list) - with open( - output_file_name, 'wb') as output_file: + result_list[-1].append(transcription_str) + result_list[-1].append(translation_str) - output_file.write( - bytes(output_buffer)) + if mode == 'acoustic': + result_list[-1].extend(acoustic_list or ['', '', '', '', '']) - for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): + # Saving mapping from the translation / transcription info string to an id of one entry of + # the group. - output_file_name = ( - 'output binary {0}.{1}'.format( - cognate_name_str, extension)) + if transcription_list or translation_list: - with open( - output_file_name, 'wb') as output_file: + entry_id_key = ( - output_file.write( - output_binary.encode(encoding)) + index, + transcription_str + ( + ' ʽ' + translation_str + 'ʼ' if translation_str else '')) - # For cognate suggestions we just parse and return suggestions. + entry_id_dict[entry_id_key] = entry_id_list[0] - if mode == 'suggestions': + # Showing what we've gathered. - suggestion_list = ( + log.debug( + '\ncognate_analysis {0}:' + '\n len(group_list): {1}' + '\n len(result_list): {2}' + '\n not_enough_count: {3}' + '\n transcription_count: {4}' + '\n translation_count: {5}' + '\n result_list:\n{6}'.format( + language_str, + len(group_list), + len(result_list), + not_enough_count, + total_transcription_count, + total_translation_count, + pprint.pformat(result_list, width = 108))) - CognateAnalysis.parse_suggestions( - language_str, - output_binary, - len(perspective_info_list), - source_perspective_index, - entry_id_dict, - __debug_flag__, - cognate_name_str if __debug_flag__ else None, - group_field_id if __debug_flag__ else None)) + # If we have no data at all, we return empty result. - result_dict = ( + if len(result_list) <= 1 and not_suggestions: - dict( + return CognateAnalysis( + triumph = True, + dictionary_count = len(perspective_info_list), + group_count = len(group_list), + not_enough_count = not_enough_count, + transcription_count = total_transcription_count, + translation_count = total_translation_count, + result = '', + xlsx_url = '', + distance_list = [], + figure_url = '', + intermediate_url_list = None) - triumph = True, + analysis_f = ( + cognate_acoustic_analysis_f if mode == 'acoustic' else + cognate_reconstruction_f if mode == 'reconstruction' else + cognate_reconstruction_multi_f if mode == 'multi' else + cognate_suggestions_f if mode == 'suggestions' else + cognate_analysis_f) - dictionary_count = len(perspective_info_list), - group_count = len(group_list), - not_enough_count = not_enough_count, - transcription_count = total_transcription_count, - translation_count = total_translation_count, + # Preparing analysis input. 
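# The analysis input prepared below is one wide string: every cell is
# NUL-terminated, and the phonemic sections are joined to the grouped data
# with one more NUL separator.  The framing, summarized:

def pack_analysis_input(phonemic_data_list, result_list):

    phonemic_input_list = [
        ''.join(text + '\0' for text in text_list)
        for text_list in phonemic_data_list]

    result_input = ''.join(
        ''.join(text + '\0' for text in text_list)
        for text_list in result_list)

    return '\0'.join(phonemic_input_list + [result_input])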
- result = output, + phonemic_input_list = [ + ''.join(text + '\0' for text in text_list) + for text_list in phonemic_data_list] - perspective_name_list = perspective_name_list, + suggestions_result_list = [] - suggestion_list = suggestion_list, - suggestion_field_id = group_field_id, + for tt_list in itertools.zip_longest( + *suggestions_data_list, fillvalue = ['', '']): - intermediate_url_list = - intermediate_url_list if __intermediate_flag__ else None)) + suggestions_result_list.append([]) - if __debug_flag__ and __result_flag__: + for tt in tt_list: + suggestions_result_list[-1].extend(tt) - with gzip.open( - result_file_name, 'wb') as result_file: + if mode == 'suggestions': - pickle.dump(result_dict, result_file) + # Showing additional ungrouped input data, if required. - return CognateAnalysis(**result_dict) + log.debug( + '\ncognate_analysis {0}:' + '\nsuggestions_result_list:\n{1}'.format( + language_str, + pprint.pformat(suggestions_result_list, width = 144))) - # Performing etymological distance analysis, if required. + result_input = ( - d_output = None - d_output_binary = None + ''.join( + ''.join(text + '\0' for text in text_list) - if distance_flag or figure_flag: + for text_list in ( + result_list + suggestions_result_list))) - d_output_buffer_size = cognate_distance_analysis_f( - None, len(perspective_info_list), len(result_list), None, 1) + input = '\0'.join(phonemic_input_list + [result_input]) - log.debug( - 'cognate_analysis {0}: distance output buffer size {1}'.format( + log.debug( + '\ncognate_analysis {0}:' + '\nanalysis_f: {1}' + '\ninput ({2} columns, {3} rows{4}):\n{5}'.format( language_str, - d_output_buffer_size)) + repr(analysis_f), + len(perspective_info_list), + len(result_list), + '' if mode != 'suggestions' else + ', {0} ungrouped rows'.format(len(suggestions_result_list)), + pprint.pformat([input[i : i + 256] + for i in range(0, len(input), 256)], width = 144))) + + # Saving input to a file, if required. - d_output_buffer = ctypes.create_unicode_buffer(d_output_buffer_size + 256) + storage_dir = None + intermediate_url_list = [] - d_result = cognate_distance_analysis_f( - input_buffer, len(perspective_info_list), len(result_list), d_output_buffer, 1) + if __debug_flag__ or __intermediate_flag__: - # If we don't have a good result, we return an error. + language_name_str = ( + ' '.join(multi_name_list) if mode == 'multi' else + base_language_name.strip()) - log.debug( - 'cognate_analysis {0}: distance result {1}'.format( - language_str, - d_result)) + if len(language_name_str) > 64: + language_name_str = language_name_str[:64] + '...' - if d_result <= 0: + mode_name_str = ( - if task_status is not None: + '{0} {1} {2} {3}{4}'.format( - task_status.set(5, 100, - 'Finished (ERROR): library call error {0}'.format(d_result)) + ' multi{0}'.format(len(multi_list)) + if mode == 'multi' else + (' ' + mode if mode else ''), - return ResponseError(message = - 'Cognate analysis library call error {0}'.format(d_result)) + language_name_str, - # Showing what we've got. 
+ ' '.join(str(count) for id, count in multi_list) + if mode == 'multi' else + len(perspective_info_list), - d_output = d_output_buffer.value + len(result_list), - distance_output_list = [ - d_output[i : i + 256] - for i in range(0, len(d_output), 256)] + '' if not_suggestions else + ' {} {} {} {}'.format( + len(suggestions_result_list), + source_perspective_index, + match_translations_value, + int(only_orphans_flag)))) - log.debug( - 'cognate_analysis {0}:\ndistance output:\n{1}'.format( - language_str, - pprint.pformat( - distance_output_list, width = 144))) + cognate_name_str = ( + 'cognate' + mode_name_str) - # Saving distance output buffer and distance output to files, if required. + # Initializing file storage directory, if required. - if __debug_flag__: + if __intermediate_flag__ and storage_dir is None: - d_output_file_name = ( - 'output {0}.buffer'.format( - cognate_name_str)) + cur_time = time.time() - with open( - d_output_file_name, 'wb') as d_output_file: + storage_dir = os.path.join( + storage['path'], 'cognate', str(cur_time)) - d_output_file.write( - bytes(d_output_buffer)) + for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): - for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): + input_file_name = ( - d_output_file_name = ( - 'output {0}.{1}'.format( - cognate_name_str, extension)) + pathvalidate.sanitize_filename( + 'input {0}.{1}'.format( + cognate_name_str, extension))) - with open( - d_output_file_name, 'wb') as d_output_file: + # Saving to the working directory... - d_output_file.write( - d_output.encode(encoding)) + if __debug_flag__: - # Getting binary output for parsing and exporting. + with open(input_file_name, 'wb') as input_file: + input_file.write(input.encode(encoding)) - d_result_binary = cognate_distance_analysis_f( - input_buffer, len(perspective_info_list), len(result_list), d_output_buffer, 2) + # ...and / or to the file storage. - log.debug( - 'cognate_analysis {0}: distance result_binary {1}'.format( - language_str, - d_result_binary)) + if __intermediate_flag__: - if d_result_binary <= 0: + input_path = os.path.join( + storage_dir, input_file_name) - if task_status is not None: + os.makedirs( + os.path.dirname(input_path), + exist_ok = True) - task_status.set(5, 100, - 'Finished (ERROR): library call (binary) error {0}'.format(d_result_binary)) + with open(input_path, 'wb') as input_file: + input_file.write(input.encode(encoding)) - return ResponseError(message = - 'Cognate analysis library call (binary) error {0}'.format(d_result_binary)) + input_url = ''.join([ + storage['prefix'], + storage['static_route'], + 'cognate', '/', + str(cur_time), '/', + input_file_name]) - # Showing what we've got from the binary output call. + intermediate_url_list.append(input_url) - d_output_binary = d_output_buffer[:d_result_binary] + # Calling analysis library, starting with getting required output buffer size and continuing + # with analysis proper. - d_output_binary_list = [ - d_output_binary[i : i + 256] - for i in range(0, len(d_output_binary), 256)] + if mode == 'multi': - log.debug( - '\ncognate_analysis {0}:' - '\ndistance output_binary:\n{1}'.format( - language_str, - pprint.pformat( - d_output_binary_list, width = 144))) + multi_count_list = [ + perspective_count + for language_id, perspective_count in multi_list] - # Indicating task's final stage, if required. 
+ perspective_count_array = ( + ctypes.c_int * len(multi_list))(*multi_count_list) - if task_status is not None: - task_status.set(4, 99, 'Exporting analysis results to XLSX') + # int CognateMultiReconstruct_GetAllOutput( + # LPTSTR bufIn, int* pnCols, int nGroups, int nRows, LPTSTR bufOut, int flags) - # Parsing analysis results and exporting them as an Excel file. + output_buffer_size = analysis_f( + None, + perspective_count_array, + len(multi_list), + len(result_list), + None, + 1) - workbook_stream, distance_matrix_list = ( + elif mode == 'suggestions': - CognateAnalysis.export_xlsx( - language_str, - mode, - output_binary, - d_output_binary, - len(perspective_info_list), - __debug_flag__, - cognate_name_str if __debug_flag__ else None)) + # int GuessCognates_GetAllOutput( + # LPTSTR bufIn, int nCols, int nRowsCorresp, int nRowsRest, int iDictThis, int lookMeaning, + # int onlyOrphans, LPTSTR bufOut, int flags) - current_datetime = datetime.datetime.now(datetime.timezone.utc) + output_buffer_size = analysis_f( + None, + len(perspective_info_list), + len(result_list), + len(suggestions_result_list), + source_perspective_index, + match_translations_value, + int(only_orphans_flag), + None, + 1) - xlsx_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.xlsx'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) + else: - if storage_dir is None: + # int CognateAnalysis_GetAllOutput( + # LPTSTR bufIn, int nCols, int nRows, LPTSTR bufOut, int flags) - cur_time = time.time() - storage_dir = os.path.join(storage['path'], 'cognate', str(cur_time)) + output_buffer_size = analysis_f( + None, + len(perspective_info_list), + len(result_list), + None, + 1) - # Storing Excel file with the results. + log.debug( + '\ncognate_analysis {0}: output buffer size {1}'.format( + language_str, + output_buffer_size)) - xlsx_path = os.path.join(storage_dir, xlsx_filename) - os.makedirs(os.path.dirname(xlsx_path), exist_ok = True) + input_buffer = ctypes.create_unicode_buffer(input) - workbook_stream.seek(0) + # Saving input buffer to a file, if required. - with open(xlsx_path, 'wb') as xlsx_file: - shutil.copyfileobj(workbook_stream, xlsx_file) + if __debug_flag__: - xlsx_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', xlsx_filename]) + input_file_name = ( + 'input {0}.buffer'.format( + cognate_name_str)) - # Selecting one of the distance matrices, if we have any. 
+ with open(input_file_name, 'wb') as input_file: + input_file.write(bytes(input_buffer)) - distance_header_array = None + output_buffer = ctypes.create_unicode_buffer(output_buffer_size + 256) - if distance_matrix_list is not None: + if mode == 'multi': - distance_matrix = distance_matrix_list[-1] + result = analysis_f( + input_buffer, + perspective_count_array, + len(multi_list), + len(result_list), + output_buffer, + 1) - if distance_vowel_flag and distance_consonant_flag: - pass + elif mode == 'suggestions': - elif distance_vowel_flag: - distance_matrix = distance_matrix_list[0] + result = analysis_f( + input_buffer, + len(perspective_info_list), + len(result_list), + len(suggestions_result_list), + source_perspective_index, + match_translations_value, + int(only_orphans_flag), + output_buffer, + 1) - elif distance_consonant_flag: - distance_matrix = distance_matrix_list[1] + else: - (distance_title, - distance_header_list, - distance_data_list, - distance_header_array, - distance_data_array) = distance_matrix + result = analysis_f( + input_buffer, + len(perspective_info_list), + len(result_list), + output_buffer, + 1) - # Generating list of etymological distances to the reference perspective, if required. + log.debug( + '\ncognate_analysis {0}: result {1}'.format( + language_str, + result)) - distance_list = None + # If we don't have a good result, we return an error. - if distance_flag and reference_perspective_id is not None: + if result <= 0: - reference_index = None + if task_status is not None: - for index, perspective_id in enumerate(perspective_id_list): - if perspective_id == reference_perspective_id: + task_status.set(5, 100, + 'Finished (ERROR): library call error {0}'.format(result)) - reference_index = index - break + return ResponseError(message = + 'Cognate analysis library call error {0}'.format(result)) - if reference_index is not None: + output = output_buffer.value - distance_value_list = list(map( - float, distance_data_list[reference_index])) + log.debug( + '\ncognate_analysis {}:\noutput ({}):\n{}'.format( + language_str, + len(output), + pprint.pformat([output[i : i + 256] + for i in range(0, len(output), 256)], width = 144))) - max_distance = float(max(distance_value_list)) + # Saving output buffer and output to files, if required. - # Compiling and showing relative distance list. + if __debug_flag__: - if max_distance > 0: - distance_list = [ - (perspective_id, distance / max_distance) + output_file_name = ( + 'output {0}.buffer'.format( + cognate_name_str)) - for perspective_id, distance in zip( - perspective_id_list, distance_value_list)] + with open(output_file_name, 'wb') as output_file: + output_file.write(bytes(output_buffer)) - else: + for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): - distance_list = distance_value_list + output_file_name = ( + 'output {0}.{1}'.format( + cognate_name_str, + extension)) - log.debug( - '\ncognate_analysis {0}:' - '\n perspective_id_list: {1}' - '\n perspective_name_list:\n{2}' - '\n reference_perspective_id: {3}' - '\n reference_index: {4}' - '\n distance_value_list: {5}' - '\n max_distance: {6}' - '\n distance_list: {7}'.format( - language_str, - perspective_id_list, - pprint.pformat(perspective_name_list, width = 144), - reference_perspective_id, - reference_index, - distance_value_list, - max_distance, - distance_list)) + with open(output_file_name, 'wb') as output_file: + output_file.write(output.encode(encoding)) - # Generating distance graph, if required. + # Reflowing output. 
- figure_url = None + line_list = output.split('\r\n') - mst_list = None - embedding_2d_pca = None - embedding_3d_pca = None + text_wrapper = textwrap.TextWrapper( + width = max(196, len(perspective_info_list) * 40), tabsize = 20) - if figure_flag: + reflow_list = [] - d_ij = (distance_data_array + distance_data_array.T) / 2 + for line in line_list: + reflow_list.extend(text_wrapper.wrap(line)) - log.debug( - '\ncognate_analysis {0}:' - '\ndistance_header_array:\n{1}' - '\ndistance_data_array:\n{2}' - '\nd_ij:\n{3}'.format( - language_str, - distance_header_array, - distance_data_array, - d_ij)) + wrapped_output = '\n'.join(reflow_list) - # Projecting the graph into a 2d plane via relative distance strain optimization, using PCA to - # orient it left-right. + log.debug( + 'cognate_analysis {0}:\nwrapped output:\n{1}'.format( + language_str, + wrapped_output)) - if len(distance_data_array) > 1: + # Getting binary output for parsing and exporting. - embedding_2d, strain_2d = ( - CognateAnalysis.graph_2d_embedding(d_ij, verbose = __debug_flag__)) + if mode == 'multi': - embedding_2d_pca = ( - sklearn.decomposition.PCA(n_components = 2) - .fit_transform(embedding_2d)) + result_binary = analysis_f( + input_buffer, + perspective_count_array, + len(multi_list), + len(result_list), + output_buffer, + 2) - distance_2d = sklearn.metrics.euclidean_distances(embedding_2d) + # If we are in the suggestions mode, we currently just return the output. - else: + elif mode == 'suggestions': - embedding_2d = numpy.zeros((1, 2)) - embedding_2d_pca = numpy.zeros((1, 2)) + result_binary = analysis_f( + input_buffer, + len(perspective_info_list), + len(result_list), + len(suggestions_result_list), + source_perspective_index, + match_translations_value, + int(only_orphans_flag), + output_buffer, + 2) - strain_2d = 0.0 + else: - distance_2d = numpy.zeros((1, 1)) + result_binary = analysis_f( + input_buffer, + len(perspective_info_list), + len(result_list), + output_buffer, + 2) - # Showing what we computed. + log.debug( + 'cognate_analysis {0}: result_binary {1}'.format( + language_str, + result_binary)) - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 2d:\n{1}' - '\nembedding 2d (PCA-oriented):\n{2}' - '\nstrain 2d:\n{3}' - '\ndistances 2d:\n{4}'.format( - language_str, - embedding_2d, - embedding_2d_pca, - strain_2d, - distance_2d)) + if result_binary <= 0: - # And now the same with 3d embedding. + if task_status is not None: - if len(distance_data_array) > 1: + task_status.set(5, 100, + 'Finished (ERROR): library call (binary) error {0}'.format(result_binary)) - embedding_3d, strain_3d = ( - CognateAnalysis.graph_3d_embedding(d_ij, verbose = __debug_flag__)) + return ResponseError(message = + 'Cognate analysis library call (binary) error {0}'.format(result_binary)) - # At least three points, standard PCA-based orientation. + # Showing what we've got from the binary output call. - if len(distance_data_array) >= 3: + output_binary = output_buffer[:result_binary] - embedding_3d_pca = ( - sklearn.decomposition.PCA(n_components = 3) - .fit_transform(embedding_3d)) + output_binary_list = [ + output_binary[i : i + 256] + for i in range(0, len(output_binary), 256)] - # Only two points, so we take 2d embedding and extend it with zeros. + log.debug( + '\ncognate_analysis {0}:' + '\noutput_binary:\n{1}'.format( + language_str, + pprint.pformat( + output_binary_list, width = 144))) - else: + # Saving binary output buffer and binary output to files, if required. 
- embedding_3d_pca = ( + if __debug_flag__: - numpy.hstack(( - embedding_2d_pca, - numpy.zeros((embedding_2d_pca.shape[0], 1))))) + output_file_name = ( + 'output binary {0}.buffer'.format( + cognate_name_str)) - # Making 3d embedding actually 3d, if required. + with open( + output_file_name, 'wb') as output_file: - if embedding_3d_pca.shape[1] <= 2: + output_file.write( + bytes(output_buffer)) - embedding_3d_pca = ( + for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): - numpy.hstack(( - embedding_3d_pca, - numpy.zeros((embedding_3d_pca.shape[0], 1))))) + output_file_name = ( + 'output binary {0}.{1}'.format( + cognate_name_str, extension)) - distance_3d = ( - sklearn.metrics.euclidean_distances(embedding_3d_pca)) + with open( + output_file_name, 'wb') as output_file: - else: + output_file.write( + output_binary.encode(encoding)) - embedding_3d = numpy.zeros((1, 3)) - embedding_3d_pca = numpy.zeros((1, 3)) + # For cognate suggestions we just parse and return suggestions. - strain_3d = 0.0 + if mode == 'suggestions': - distance_3d = numpy.zeros((1, 1)) + suggestion_list = ( - # Showing what we've get. + CognateAnalysis.parse_suggestions( + language_str, + output_binary, + len(perspective_info_list), + source_perspective_index, + entry_id_dict, + __debug_flag__, + cognate_name_str if __debug_flag__ else None, + group_field_id if __debug_flag__ else None)) - log.debug( - '\ncognate_analysis {0}:' - '\nembedding 3d:\n{1}' - '\nembedding 3d (PCA-oriented):\n{2}' - '\nstrain 3d:\n{3}' - '\ndistances 3d:\n{4}'.format( - language_str, - embedding_3d, - embedding_3d_pca, - strain_3d, - distance_3d)) + result_dict = ( - # Computing minimum spanning tree via standard Jarnik-Prim-Dijkstra algorithm using 2d and 3d - # embedding distances to break ties. + dict( - if len(distance_data_array) <= 1: - mst_list = [] + triumph = True, - else: + dictionary_count = len(perspective_info_list), + group_count = len(group_list), + not_enough_count = not_enough_count, + transcription_count = total_transcription_count, + translation_count = total_translation_count, - d_min, d_extra_min, min_i, min_j = min( - (d_ij[i,j], distance_2d[i,j] + distance_3d[i,j], i, j) - for i in range(d_ij.shape[0] - 1) - for j in range(i + 1, d_ij.shape[0])) + result = output, - mst_list = [(min_i, min_j)] - mst_dict = {} + perspective_name_list = perspective_name_list, - # MST construction initialization. + suggestion_list = suggestion_list, + suggestion_field_id = group_field_id, - for i in range(d_ij.shape[0]): + intermediate_url_list = + intermediate_url_list if __intermediate_flag__ else None)) - if i == min_i or i == min_j: - continue + if __debug_flag__ and __result_flag__: - d_min_i = (d_ij[i, min_i], distance_2d[i, min_i] + distance_3d[i, min_i]) - d_min_j = (d_ij[i, min_j], distance_2d[i, min_j] + distance_3d[i, min_j]) + with gzip.open( + result_file_name, 'wb') as result_file: - mst_dict[i] = ( - (d_min_i, min_i) if d_min_i <= d_min_j else - (d_min_j, min_i)) + pickle.dump(result_dict, result_file) - # Iterative MST construction. + return CognateAnalysis(**result_dict) - while len(mst_dict) > 0: + # Performing etymological distance analysis, if required. 
- (d_min, d_extra_min, i_min, i_from_min) = min( - (d, d_extra, i, i_from) for i, ((d, d_extra), i_from) in mst_dict.items()) + d_output = None + d_output_binary = None - log.debug('\n' + pprint.pformat(mst_dict)) - log.debug('\n' + repr((i_from_min, i_min, d_min, d_extra_min))) + if distance_flag or figure_flag: - mst_list.append((i_from_min, i_min)) - del mst_dict[i_min] + d_output_buffer_size = cognate_distance_analysis_f( + None, len(perspective_info_list), len(result_list), None, 1) - # Updating shortest connection info. + log.debug( + 'cognate_analysis {0}: distance output buffer size {1}'.format( + language_str, + d_output_buffer_size)) - for i_to in mst_dict.keys(): + d_output_buffer = ctypes.create_unicode_buffer(d_output_buffer_size + 256) - d_to = (d_ij[i_min, i_to], distance_2d[i_min, i_to] + distance_3d[i_min, i_to]) + d_result = cognate_distance_analysis_f( + input_buffer, len(perspective_info_list), len(result_list), d_output_buffer, 1) - if d_to < mst_dict[i_to][0]: - mst_dict[i_to] = (d_to, i_min) + # If we don't have a good result, we return an error. log.debug( - '\ncognate_analysis {0}:' - '\nminimum spanning tree:\n{1}'.format( + 'cognate_analysis {0}: distance result {1}'.format( language_str, - pprint.pformat(mst_list))) + d_result)) - # Plotting with matplotlib. + if d_result <= 0: - figure = pyplot.figure(figsize = (10, 10)) - axes = figure.add_subplot(212) + if task_status is not None: - axes.set_title( - 'Etymological distance tree (relative distance embedding)', - fontsize = 14, family = 'Gentium') + task_status.set(5, 100, + 'Finished (ERROR): library call error {0}'.format(d_result)) - axes.axis('equal') - axes.axis('off') - axes.autoscale() + return ResponseError(message = + 'Cognate analysis library call error {0}'.format(d_result)) - def f(axes, embedding_pca): - """ - Plots specified graph embedding on a given axis. - """ + # Showing what we've got. - flag_3d = numpy.size(embedding_pca, 1) > 2 + d_output = d_output_buffer.value - for index, (position, name) in enumerate( - zip(embedding_pca, distance_header_array)): + distance_output_list = [ + d_output[i : i + 256] + for i in range(0, len(d_output), 256)] - # Checking if any of the previous perspectives are already in this perspective's - # position. + log.debug( + 'cognate_analysis {0}:\ndistance output:\n{1}'.format( + language_str, + pprint.pformat( + distance_output_list, width = 144))) - same_position_index = None + # Saving distance output buffer and distance output to files, if required. + + if __debug_flag__: + + d_output_file_name = ( + 'output {0}.buffer'.format( + cognate_name_str)) + + with open( + d_output_file_name, 'wb') as d_output_file: + + d_output_file.write( + bytes(d_output_buffer)) - for i, p in enumerate(embedding_pca[:index]): - if numpy.linalg.norm(position - p) <= 1e-3: + for extension, encoding in ('utf8', 'utf-8'), ('utf16', 'utf-16'): - same_position_index = i - break + d_output_file_name = ( + 'output {0}.{1}'.format( + cognate_name_str, extension)) - color = matplotlib.colors.hsv_to_rgb( - [(same_position_index or index) * 1.0 / len(distance_header_array), 0.5, 0.75]) + with open( + d_output_file_name, 'wb') as d_output_file: - label_same_str = ( - '' if same_position_index is None else - ' (same as {0})'.format(same_position_index + 1)) + d_output_file.write( + d_output.encode(encoding)) - kwargs = { - 's': 35, - 'color': color, - 'label': '{0}) {1}{2}'.format(index + 1, name, label_same_str)} + # Getting binary output for parsing and exporting. 
- axes.scatter(*position, **kwargs) + d_result_binary = cognate_distance_analysis_f( + input_buffer, len(perspective_info_list), len(result_list), d_output_buffer, 2) - # Annotating position with its number, but only if we hadn't already annotated nearby. + log.debug( + 'cognate_analysis {0}: distance result_binary {1}'.format( + language_str, + d_result_binary)) - if same_position_index is None: + if d_result_binary <= 0: - if flag_3d: + if task_status is not None: - axes.text( - position[0] + 0.01, position[1], position[2] + 0.01, - str(index + 1), None, fontsize = 14) + task_status.set(5, 100, + 'Finished (ERROR): library call (binary) error {0}'.format(d_result_binary)) - else: + return ResponseError(message = + 'Cognate analysis library call (binary) error {0}'.format(d_result_binary)) - axes.annotate( - str(index + 1), - (position[0] + 0.01, position[1] - 0.005), - fontsize = 14) + # Showing what we've got from the binary output call. - # Plotting minimum spanning trees. + d_output_binary = d_output_buffer[:d_result_binary] - line_list = [ - (embedding_pca[i], embedding_pca[j]) - for i, j in mst_list] + d_output_binary_list = [ + d_output_binary[i : i + 256] + for i in range(0, len(d_output_binary), 256)] - line_collection = ( - Line3DCollection if flag_3d else LineCollection)( - line_list, zorder = 0, color = 'gray') + log.debug( + '\ncognate_analysis {0}:' + '\ndistance output_binary:\n{1}'.format( + language_str, + pprint.pformat( + d_output_binary_list, width = 144))) - axes.add_collection(line_collection) + # Indicating task's final stage, if required. - pyplot.setp(axes.texts, family = 'Gentium') + if task_status is not None: + task_status.set(4, 99, 'Exporting analysis results to XLSX') - # Plotting our embedding, creating the legend. + # Parsing analysis results and exporting them as an Excel file. - f(axes, embedding_2d_pca) + workbook_stream, distance_matrix_list = ( - pyplot.tight_layout() + CognateAnalysis.export_xlsx( + language_str, + mode, + output_binary, + d_output_binary, + len(perspective_info_list), + __debug_flag__, + cognate_name_str if __debug_flag__ else None)) - legend = axes.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + current_datetime = datetime.datetime.now(datetime.timezone.utc) - pyplot.setp(legend.texts, family = 'Gentium') - axes.autoscale_view() + xlsx_filename = pathvalidate.sanitize_filename( + '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.xlsx'.format( + base_language_name[:64], + ' ' + mode if mode else '', + current_datetime.year, + current_datetime.month, + current_datetime.day)) - # Saving generated figure for debug purposes, if required. + if storage_dir is None: - if __debug_flag__: + cur_time = time.time() + storage_dir = os.path.join(storage['path'], 'cognate', str(cur_time)) - figure_file_name = ( - 'figure cognate distance{0}.png'.format( - mode_name_str)) + # Storing Excel file with the results. - with open(figure_file_name, 'wb') as figure_file: + xlsx_path = os.path.join(storage_dir, xlsx_filename) + os.makedirs(os.path.dirname(xlsx_path), exist_ok = True) - pyplot.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') + workbook_stream.seek(0) - # Also generating 3d embedding figure. 
+ with open(xlsx_path, 'wb') as xlsx_file: + shutil.copyfileobj(workbook_stream, xlsx_file) - figure_3d = pyplot.figure() - figure_3d.set_size_inches(16, 10) + xlsx_url = ''.join([ + storage['prefix'], storage['static_route'], + 'cognate', '/', str(cur_time), '/', xlsx_filename]) - axes_3d = figure_3d.add_subplot(111, projection = '3d') + # Selecting one of the distance matrices, if we have any. - axes_3d.axis('equal') - axes_3d.view_init(elev = 30, azim = -75) + distance_header_array = None - f(axes_3d, embedding_3d_pca) + if distance_matrix_list is not None: - # Setting up legend. + distance_matrix = distance_matrix_list[-1] - axes_3d.set_xlabel('X') - axes_3d.set_ylabel('Y') - axes_3d.set_zlabel('Z') + if distance_vowel_flag and distance_consonant_flag: + pass - legend_3d = axes_3d.legend( - scatterpoints = 1, - loc = 'upper center', - bbox_to_anchor = (0.5, -0.05), - frameon = False, - handlelength = 0.5, - handletextpad = 0.75, - fontsize = 14) + elif distance_vowel_flag: + distance_matrix = distance_matrix_list[0] - pyplot.setp(legend_3d.texts, family = 'Gentium') + elif distance_consonant_flag: + distance_matrix = distance_matrix_list[1] - # Fake cubic bounding box to force axis aspect ratios, see - # https://stackoverflow.com/a/13701747/2016856. + (distance_title, + distance_header_list, + distance_data_list, + distance_header_array, + distance_data_array) = distance_matrix - X = embedding_3d_pca[:,0] - Y = embedding_3d_pca[:,1] - Z = embedding_3d_pca[:,2] + # Generating list of etymological distances to the reference perspective, if required. - max_range = numpy.array([ - X.max() - X.min(), Y.max() - Y.min(), Z.max() - Z.min()]).max() + distance_list = None - Xb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][0].flatten() + - 0.5 * (X.max() + X.min())) + if distance_flag and reference_perspective_id is not None: - Yb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][1].flatten() + - 0.5 * (Y.max() + Y.min())) + reference_index = None - Zb = ( - 0.5 * max_range * numpy.mgrid[-1:2:2,-1:2:2,-1:2:2][2].flatten() + - 0.5 * (Z.max() + Z.min())) + for index, perspective_id in enumerate(perspective_id_list): + if perspective_id == reference_perspective_id: - for xb, yb, zb in zip(Xb, Yb, Zb): - axes_3d.plot([xb], [yb], [zb], 'w') + reference_index = index + break - axes_3d.autoscale_view() + if reference_index is not None: - # And saving it. + distance_value_list = list(map( + float, distance_data_list[reference_index])) - figure_3d_file_name = ( - 'figure 3d cognate distance{0}.png'.format( - mode_name_str)) + max_distance = float(max(distance_value_list)) - with open(figure_3d_file_name, 'wb') as figure_3d_file: + # Compiling and showing relative distance list. - figure_3d.savefig( - figure_3d_file, - bbox_extra_artists = (legend_3d,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') + if max_distance > 0: + distance_list = [ + (perspective_id, distance / max_distance) - # Storing generated figure as a PNG image. 
+ for perspective_id, distance in zip( + perspective_id_list, distance_value_list)] - figure_filename = pathvalidate.sanitize_filename( - '{0} cognate{1} analysis {2:04d}.{3:02d}.{4:02d}.png'.format( - base_language_name[:64], - ' ' + mode if mode else '', - current_datetime.year, - current_datetime.month, - current_datetime.day)) + else: - figure_path = os.path.join(storage_dir, figure_filename) - os.makedirs(os.path.dirname(figure_path), exist_ok = True) + distance_list = distance_value_list - with open(figure_path, 'wb') as figure_file: + log.debug( + '\ncognate_analysis {0}:' + '\n perspective_id_list: {1}' + '\n perspective_name_list:\n{2}' + '\n reference_perspective_id: {3}' + '\n reference_index: {4}' + '\n distance_value_list: {5}' + '\n max_distance: {6}' + '\n distance_list: {7}'.format( + language_str, + perspective_id_list, + pprint.pformat(perspective_name_list, width = 144), + reference_perspective_id, + reference_index, + distance_value_list, + max_distance, + distance_list)) - figure.savefig( - figure_file, - bbox_extra_artists = (legend,), - bbox_inches = 'tight', - pad_inches = 0.25, - format = 'png') + # Generating distance graph, if required. + figure_url = None + mst_list = None + embedding_2d_pca = None + embedding_3d_pca = None - figure_url = ''.join([ - storage['prefix'], storage['static_route'], - 'cognate', '/', str(cur_time), '/', figure_filename]) + if figure_flag: + figure_url, mst_list, embedding_2d_pca, embedding_3d_pca = \ + CognateAnalysis.distance_graph( + language_str, + base_language_name, + distance_data_array, + distance_header_array, + mode, + storage, + storage_dir, + __debug_flag__ + ) # Finalizing task status, if required, returning result. - if task_status is not None: result_link_list = ( @@ -12875,7 +12909,6 @@ def mutate(self, info, **args): request.response.status = HTTPOk.code if synchronous: - CognateAnalysis.perform_cognate_analysis( language_str, source_perspective_id, @@ -12931,7 +12964,6 @@ def mutate(self, info, **args): return CognateAnalysis(triumph = True) # We do not use acoustic data, so we perform cognate analysis synchronously. 
-
         else:

             return CognateAnalysis.perform_cognate_analysis(

@@ -12974,6 +13006,515 @@ def mutate(self, info, **args):
             'Exception:\n' + traceback_string)


+class SwadeshAnalysis(graphene.Mutation):
+    class Arguments:
+
+        source_perspective_id = LingvodocID(required = True)
+        base_language_id = LingvodocID(required = True)
+
+        group_field_id = LingvodocID(required = True)
+        perspective_info_list = graphene.List(graphene.List(LingvodocID), required = True)
+
+    triumph = graphene.Boolean()
+
+    result = graphene.String()
+    xlsx_url = graphene.String()
+    minimum_spanning_tree = graphene.List(graphene.List(graphene.Int))
+    embedding_2d = graphene.List(graphene.List(graphene.Float))
+    embedding_3d = graphene.List(graphene.List(graphene.Float))
+    perspective_name_list = graphene.List(graphene.String)
+
+    @staticmethod
+    def export_dataframe(result_pool, distance_data_array, bundles):
+        '''
+        Keys:
+            result_pool[perspective_id][entry_id]
+        Fields:
+            'group': group_index,
+            'borrowed': bool,
+            'swadesh': swadesh_lex,
+            'transcription': transcription_list[0],
+            'translation': translation_lex
+        '''
+
+        groups = pd.DataFrame()
+        singles = pd.DataFrame()
+        distances = pd.DataFrame(distance_data_array,
+                                 columns=[perspective['name'] for perspective in result_pool.values()])
+        # Start the distances index from 1 to match dictionary numbering.
+        distances.index += 1
+
+        row_index = 0
+        # Re-group entries by group number and add the joined values.
+        for perspective in result_pool.values():
+            dict_name = perspective['name']
+            for entry in perspective.values():
+                # Besides entry dictionaries, this iterator also yields the string values of the
+                # 'name' or 'suite' fields; skip any such non-dictionary values.
+                if not isinstance(entry, dict):
+                    continue
+                group_num = entry['group']
+                entry_text = f"{entry['swadesh']} [ {entry['transcription']} ] {entry['translation']}"
+                if group_num and group_num in bundles:
+                    groups.loc[group_num, dict_name] = entry_text
+                else:
+                    singles.loc[row_index, dict_name] = entry_text
+                    row_index += 1
+
+        return {
+            'Cognates': groups if groups.empty else groups.sort_values(groups.columns[0]),
+            'Singles': singles.sort_index(),
+            'Distances': distances.sort_index()
+        }
+
+    @staticmethod
+    def export_xlsx(
+            result,
+            base_language_name,
+            storage
+    ):
+        # Exporting analysis results as an Excel file.
+
+        current_datetime = datetime.datetime.now(datetime.timezone.utc)
+        xlsx_filename = pathvalidate.sanitize_filename(
+            '{0} {1} {2:04d}.{3:02d}.{4:02d}.xlsx'.format(
+                base_language_name[:64],
+                'glottochronology',
+                current_datetime.year,
+                current_datetime.month,
+                current_datetime.day))
+
+        cur_time = time.time()
+        storage_dir = os.path.join(storage['path'], 'glottochronology', str(cur_time))
+
+        # Storing Excel file with the results.
+
+        xlsx_path = os.path.join(storage_dir, xlsx_filename)
+        os.makedirs(os.path.dirname(xlsx_path), exist_ok=True)
+
+        with pd.ExcelWriter(xlsx_path, engine='xlsxwriter') as writer:
+            header_format = writer.book.add_format({'bold': True,
+                                                    'text_wrap': True,
+                                                    'valign': 'top',
+                                                    'fg_color': '#D7E4BC',
+                                                    'border': 1})
+            for sheet_name, df in result.items():
+                index = (sheet_name == 'Distances')
+                startcol = int(index)
+
+                df.to_excel(writer,
+                            sheet_name=sheet_name,
+                            index=index,
+                            startrow=1,
+                            header=False)
+
+                worksheet = writer.sheets[sheet_name]
+                worksheet.set_row(0, 70)
+                worksheet.set_column(startcol, df.shape[1] - 1 + startcol, 30)
+                # Write the column headers with the defined format.
+                for col_num, value in enumerate(df.columns.values):
+                    worksheet.write(0, col_num + startcol, value, header_format)
+
+        xlsx_url = ''.join([
+            storage['prefix'], storage['static_route'],
+            'glottochronology', '/', str(cur_time), '/', xlsx_filename])
+
+        return xlsx_url
+    @staticmethod
+    def swadesh_statistics(
+            language_str,
+            base_language_name,
+            group_field_id,
+            perspective_info_list,
+            locale_id,
+            storage):
+
+        swadesh_list = ['я','ты','мы','этот, это','тот, то','кто','что','не','все','много','один','два','большой',
+                        'долгий','маленький','женщина','мужчина','человек','рыба','птица','собака','вошь','дерево',
+                        'семя','лист','корень','кора','кожа','мясо','кровь','кость','жир','яйцо','рог','хвост','перо',
+                        'волосы','голова','ухо','глаз','нос','рот','зуб','язык (орган)','ноготь','нога (стопа)','колено',
+                        'рука (кисть)','живот','горло','грудь','сердце','печень','пить','есть (кушать)','кусать','видеть',
+                        'слышать','знать','спать','умирать','убивать','плавать','летать','гулять','приходить','лежать',
+                        'сидеть','стоять','дать','сказать','солнце','луна','звезда','вода','дождь','камень','песок',
+                        'земля','облако','дым','огонь','пепел','гореть','дорога,тропа','гора','красный','зелёный',
+                        'жёлтый','белый','чёрный','ночь','тёплый','холодный','полный','новый','хороший','круглый',
+                        'сухой','имя']
+
+        def compare_translations(swadesh_lex, dictionary_lex):
+            def split_lex(lex):
+                # Split on commas and opening brackets to separate the various forms
+                # of a lexeme and any extra note, if present.
+                return set(f" {form}".lower().replace(" заим.", "").strip()
+                           for form in lex.replace('(', ',').split(',')
+                           if form.strip()
+                           and ')' not in form)  # exclude notes
+            # Return True if the intersection is not empty.
+            return bool(split_lex(swadesh_lex) & split_lex(dictionary_lex))
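+        # A worked example of the matching above (illustrative only, not from the original code):
+        # split_lex('рука (кисть)') replaces '(' with ',', splits on ',', drops the ' кисть)'
+        # fragment as a note (it contains ')') and strips the " заим." borrowing mark, yielding
+        # {'рука'}; split_lex('рука') is {'рука'} as well, so
+        # compare_translations('рука (кисть)', 'рука') returns True.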
+        _, group_list, _ = (
+            CognateAnalysis.tag_data_plpgsql(
+                perspective_info_list, group_field_id))
+
+        # Getting text data for each perspective.
+        # entries_set gathers entry_id(s) of words found in the Swadesh list,
+        # swadesh_total gathers the indices of those words within the Swadesh list.
+        entries_set = {}
+        swadesh_total = {}
+        result_pool = {}
+        tiny_dicts = set()
+        for index, (perspective_id, transcription_field_id, translation_field_id) in \
+                enumerate(perspective_info_list):
+
+            # Getting and saving perspective info.
+            perspective = (
+                DBSession
+                    .query(dbPerspective)
+                    .filter_by(client_id=perspective_id[0], object_id=perspective_id[1])
+                    .first()
+            )
+            dictionary_name = perspective.parent.get_translation(locale_id)
+
+            # GC
+            del perspective
+
+            # Getting text data.
+            transcription_query = (
+                DBSession
+                    .query(
+                        dbLexicalEntry.client_id,
+                        dbLexicalEntry.object_id)
+                    .filter(
+                        dbLexicalEntry.parent_client_id == perspective_id[0],
+                        dbLexicalEntry.parent_object_id == perspective_id[1],
+                        dbLexicalEntry.marked_for_deletion == False,
+                        dbEntity.parent_client_id == dbLexicalEntry.client_id,
+                        dbEntity.parent_object_id == dbLexicalEntry.object_id,
+                        dbEntity.field_client_id == transcription_field_id[0],
+                        dbEntity.field_object_id == transcription_field_id[1],
+                        dbEntity.marked_for_deletion == False,
+                        dbPublishingEntity.client_id == dbEntity.client_id,
+                        dbPublishingEntity.object_id == dbEntity.object_id,
+                        dbPublishingEntity.published == True,
+                        dbPublishingEntity.accepted == True)
+                    .add_columns(
+                        func.array_agg(dbEntity.content).label('transcription'))
+                    .group_by(dbLexicalEntry)
+                    .subquery())
+
+            translation_query = (
+                DBSession
+                    .query(
+                        dbLexicalEntry.client_id,
+                        dbLexicalEntry.object_id)
+                    .filter(
+                        dbLexicalEntry.parent_client_id == perspective_id[0],
+                        dbLexicalEntry.parent_object_id == perspective_id[1],
+                        dbLexicalEntry.marked_for_deletion == False,
+                        dbEntity.parent_client_id == dbLexicalEntry.client_id,
+                        dbEntity.parent_object_id == dbLexicalEntry.object_id,
+                        dbEntity.field_client_id == translation_field_id[0],
+                        dbEntity.field_object_id == translation_field_id[1],
+                        dbEntity.marked_for_deletion == False,
+                        dbPublishingEntity.client_id == dbEntity.client_id,
+                        dbPublishingEntity.object_id == dbEntity.object_id,
+                        dbPublishingEntity.published == True,
+                        dbPublishingEntity.accepted == True)
+                    .add_columns(
+                        func.array_agg(dbEntity.content).label('translation'))
+                    .group_by(dbLexicalEntry)
+                    .subquery())
+
+            # Main query for transcription/translation data.
+            data_query = (
+                DBSession
+                    .query(transcription_query)
+                    .outerjoin(translation_query, and_(
+                        transcription_query.c.client_id == translation_query.c.client_id,
+                        transcription_query.c.object_id == translation_query.c.object_id))
+                    .add_columns(
+                        translation_query.c.translation)
+                    .all())
+
+            # GC
+            del transcription_query
+            del translation_query
+
+            # Grouping translations by lexical entries.
+            entries_set[perspective_id] = set()
+            swadesh_total[perspective_id] = set()
+            result_pool[perspective_id] = {'name': dictionary_name}
+            for row_index, row in enumerate(data_query):
+                entry_id = tuple(row[:2])
+                transcription_list, translation_list = row[2:4]
+
+                # If we have no transcriptions for this lexical entry, we skip it altogether.
+                if not transcription_list:
+                    continue
+
+                translation_list = (
+                    [] if not translation_list else [
+                        translation.strip()
+                        for translation in translation_list
+                        if translation.strip()])
+
+                # Parsing translations and matching them with Swadesh words.
+                transcription_lex = ', '.join(transcription_list)
+                for swadesh_num, swadesh_lex in enumerate(swadesh_list):
+                    for translation_lex in translation_list:
+                        if compare_translations(swadesh_lex, translation_lex):
+                            # Store the entry's content in human-readable format.
+                            result_pool[perspective_id][entry_id] = {
+                                'group': None,
+                                'borrowed': (" заим." in f" {transcription_lex} {translation_lex}"),
+                                'swadesh': swadesh_lex,
+                                'transcription': transcription_lex,
+                                'translation': translation_lex
+                            }
+                            # Store entry_id and the index of the lexeme within the Swadesh list.
+                            entries_set[perspective_id].add(entry_id)
+                            if not result_pool[perspective_id][entry_id]['borrowed']:
+                                # Total set of Swadesh words in the perspective;
+                                # they may have no etymological links at all.
+                                swadesh_total[perspective_id].add(swadesh_num)
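+            # " заим." is the Russian abbreviation of "заимствование" ("borrowing"): entries whose
+            # transcription or translation carries this mark are kept in result_pool, but they are
+            # excluded from swadesh_total (and later from the linked means), so borrowings do not
+            # inflate the retention statistics.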
in f" {transcription_lex} {translation_lex}"), + 'swadesh': swadesh_lex, + 'transcription': transcription_lex, + 'translation': translation_lex + } + # Store entry_id and number of the lex within Swadesh's list + entries_set[perspective_id].add(entry_id) + if not result_pool[perspective_id][entry_id]['borrowed']: + # Total list of Swadesh's words in the perspective, + # they can have no any etimological links + swadesh_total[perspective_id].add(swadesh_num) + + # Forget the dictionary if it contains less than 50 Swadesh words + if len(swadesh_total[perspective_id]) < 50: + del entries_set[perspective_id] + del swadesh_total[perspective_id] + del result_pool[perspective_id] + tiny_dicts.add(dictionary_name) + + # GC + del data_query + + # Checking if found entries have links + means = collections.OrderedDict() + for perspective_id, entries in entries_set.items(): + means[perspective_id] = collections.defaultdict(set) + for group_index, group in enumerate(group_list): + # Select etimologically linked entries + linked = entries & group + if linked: + entry_id = linked.pop() + result_pool[perspective_id][entry_id]['group'] = group_index + swadesh = result_pool[perspective_id][entry_id]['swadesh'] + # Store the correspondence: perspective { means(1/2/3) { etimological_groups(1.1/1.2/2.1/3.1) + if not result_pool[perspective_id][entry_id]['borrowed']: + means[perspective_id][swadesh].add(group_index) + + dictionary_count = len(means) + distance_data_array = numpy.full((dictionary_count, dictionary_count), 50, dtype='float') + distance_header_array = numpy.full(dictionary_count, "", dtype='object') + + # Calculate intersection between lists of linked means (Swadesh matching) + # So length of this intersection is the similarity of corresponding perspectives + # means_total is amount of Swadesh's lexems met in the both perspectives + bundles = set() + # Calculate each-to-each distances, exclude self-to-self + for n1, (perspective1, means1) in enumerate(means.items()): + # Numerate dictionaries + result_pool[perspective1]['name'] = f"{n1 + 1}. 
+        # Calculate the intersection between the lists of linked means (Swadesh matching);
+        # the length of this intersection is the similarity of the corresponding perspectives.
+        # means_total is the number of Swadesh lexemes found in both perspectives.
+        bundles = set()
+        # Calculate each-to-each distances, excluding self-to-self.
+        for n1, (perspective1, means1) in enumerate(means.items()):
+            # Numbering the dictionaries.
+            result_pool[perspective1]['name'] = f"{n1 + 1}. {result_pool[perspective1]['name']}"
+            distance_header_array[n1] = result_pool[perspective1]['name']
+            for n2, (perspective2, means2) in enumerate(means.items()):
+                if n1 == n2:
+                    distance_data_array[n1][n2] = 0
+                else:
+                    # Common means of entries which have etymological links,
+                    # though these links may not be mutual.
+                    means_common = means1.keys() & means2.keys()
+                    means_linked = 0
+                    # Checking if the found means have common links.
+                    for swadesh in means_common:
+                        links_common = means1[swadesh] & means2[swadesh]
+                        if links_common:
+                            # Bundles are links with two or more entries in the result table.
+                            bundles.update(links_common)
+                            means_linked += 1
+
+                    means_total = len(swadesh_total[perspective1] & swadesh_total[perspective2])
+
+                    if n2 > n1 and len(means_common) > 0:
+                        log.debug(f"{n1+1},{n2+1} : "
+                                  f"{len(means_common)} but {means_linked} of {means_total} : "
+                                  f"{', '.join(sorted(means_common))}")
+
+                    # means_linked > 0 implies means_total > 0.
+                    distance = math.log(means_linked / means_total) / -0.14 if means_linked > 0 else 50
+                    distance_data_array[n1][n2] = round(distance, 2)
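+        # The distance above appears to follow the classic glottochronological decay model (this
+        # reading is an interpretation, not stated in the code): with c = means_linked / means_total
+        # the share of retained cognates, t = -ln(c) / 0.14 estimates divergence; e.g. c = 0.7
+        # gives -ln(0.7) / 0.14 ≈ 2.55, while pairs with no links at all keep the cap of 50 used
+        # to initialize distance_data_array.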
+        result = SwadeshAnalysis.export_dataframe(result_pool, distance_data_array, bundles)
+
+        # GC
+        del result_pool
+
+        xlsx_url = SwadeshAnalysis.export_xlsx(result, base_language_name, storage)
+        result_tables = (build_table(result['Distances'], 'orange_light', width="300px", index=True),
+                         build_table(result['Cognates'], 'blue_light', width="300px"),
+                         build_table(result['Singles'], 'green_light', width="300px"))
+
+        # Control the output size.
+        huge_size = 1048576
+        result = f"{result_tables[0]}<br><br>\n\n{result_tables[1]}<br><br>\n\n{result_tables[2]}"
+        if len(result) > huge_size:
+            result = f"{result_tables[0]}<br><br>\n\n{result_tables[1]}" \
+                     f"<br><br>\n\nNote: The table with single words is not shown due to huge summary size<br>"
+        if len(result) > huge_size:
+            result = f"{result_tables[0]}" \
+                     f"<br><br>\n\nNote: The result tables with words are not shown due to huge summary size<br>"
+        result += ("<br>Note: The following dictionaries contain too few words and were not processed: \n\n" +
+                   '\n'.join(tiny_dicts) + "<br>") if tiny_dicts else ""
+
+        # GC
+        del result_tables
+
+        _, mst_list, embedding_2d_pca, embedding_3d_pca = \
+            CognateAnalysis.distance_graph(
+                language_str,
+                base_language_name,
+                distance_data_array,
+                distance_header_array,
+                None,
+                None,
+                None,
+                __plot_flag__ = False
+            )
+
+        result_dict = (
+            dict(
+                triumph = True,
+
+                result = result,
+                xlsx_url = xlsx_url,
+                minimum_spanning_tree = mst_list,
+                embedding_2d = embedding_2d_pca,
+                embedding_3d = embedding_3d_pca,
+                perspective_name_list = distance_header_array))
+
+        return SwadeshAnalysis(**result_dict)
+
+    @staticmethod
+    def mutate(self, info, **args):
+        """
+        mutation SwadeshAnalysis {
+            swadesh_analysis(
+                source_perspective_id: [425, 4],
+                base_language_id: [508, 41],
+                group_field_id: [66, 25],
+                perspective_info_list: [
+                    [[425, 4], [66, 8], [66, 10]],
+                    [[1552, 1759], [66, 8], [66, 10]],
+                    [[418, 4], [66, 8], [66, 10]]])
+            {
+                triumph }
+        }
+        """
+
+        # Administrator / perspective author / editing permission check.
+        error_str = (
+            'Only administrators, perspective authors and users with perspective editing permissions '
+            'can perform Swadesh analysis.')
+
+        client_id = info.context.request.authenticated_userid
+
+        if not client_id:
+            raise ResponseError(error_str)
+
+        user = Client.get_user_by_client_id(client_id)
+
+        author_client_id_set = (
+
+            set(
+                client_id
+                for (client_id, _), _, _ in args['perspective_info_list']))
+
+        author_id_check = (
+
+            DBSession
+
+                .query(
+
+                    DBSession
+                        .query(literal(1))
+                        .filter(
+                            Client.id.in_(author_client_id_set),
+                            Client.user_id == user.id)
+                        .exists())
+
+                .scalar())
+
+        if (user.id != 1 and
+                not author_id_check and
+                not info.context.acl_check_if('edit', 'perspective', args['source_perspective_id'])):
+
+            raise ResponseError(error_str)
+
+        # Getting arguments.
+
+        source_perspective_id = args['source_perspective_id']
+        base_language_id = args['base_language_id']
+
+        group_field_id = args['group_field_id']
+        perspective_info_list = args['perspective_info_list']
+
+        language_str = (
+            '{0}/{1}, language {2}/{3}'.format(
+                source_perspective_id[0], source_perspective_id[1],
+                base_language_id[0], base_language_id[1]))
+
+        try:
+
+            # Getting base language info.
+
+            locale_id = info.context.get('locale_id') or 2
+
+            base_language = DBSession.query(dbLanguage).filter_by(
+                client_id = base_language_id[0], object_id = base_language_id[1]).first()
+
+            base_language_name = base_language.get_translation(locale_id)
+
+            request = info.context.request
+            storage = request.registry.settings['storage']
+
+            # Transforming client/object pair ids from lists to 2-tuples.
+
+            source_perspective_id = tuple(source_perspective_id)
+            base_language_id = tuple(base_language_id)
+            group_field_id = tuple(group_field_id)
+
+            perspective_info_list = [
+
+                (tuple(perspective_id),
+                    tuple(transcription_field_id),
+                    tuple(translation_field_id))
+
+                for perspective_id,
+                    transcription_field_id,
+                    translation_field_id in perspective_info_list]
+
+            return SwadeshAnalysis.swadesh_statistics(
+                language_str,
+                base_language_name,
+                group_field_id,
+                perspective_info_list,
+                locale_id,
+                storage)
+
+        # Exception occurred while we tried to perform the Swadesh analysis.
+ except Exception as exception: + + traceback_string = ''.join(traceback.format_exception( + exception, exception, exception.__traceback__))[:-1] + + log.warning( + 'swadesh_analysis {0}: exception'.format( + language_str)) + + log.warning(traceback_string) + + return ResponseError(message = + 'Exception:\n' + traceback_string) + + class Phonology(graphene.Mutation): class Arguments: @@ -18807,6 +19348,7 @@ class MyMutations(graphene.ObjectType): starling_etymology = StarlingEtymology.Field() phonemic_analysis = PhonemicAnalysis.Field() cognate_analysis = CognateAnalysis.Field() + swadesh_analysis = SwadeshAnalysis.Field() phonology = Phonology.Field() phonological_statistical_distance = PhonologicalStatisticalDistance.Field() sound_and_markup = SoundAndMarkup.Field() diff --git a/server-requirements-1.txt b/server-requirements-1.txt index 50c1d5b0..0441a615 100644 --- a/server-requirements-1.txt +++ b/server-requirements-1.txt @@ -20,7 +20,7 @@ configparser==4.0.2 cycler==0.10.0 DataProperty==0.42.1 defusedxml==0.6.0 -dill==0.3.5.1 +dill==0.3.6 docutils==0.15.2 dogpile.cache==0.6.8 et-xmlfile==1.0.1 @@ -65,10 +65,10 @@ pyramid-debugtoolbar==3.0.4 pyramid-mailer==0.15.1 pyramid-mako==1.0.2 pyramid-tm==1.0.1 -python-dateutil==2.8.0 +python-dateutil==2.8.1 python-docx==0.8.10 python-editor==1.0.3 -pytz==2018.5 +pytz==2020.1 PyYAML==5.2 redis==2.10.5 regex==2019.6.8 diff --git a/server-requirements-final.txt b/server-requirements-final.txt index 460af8a7..1a41656e 100644 --- a/server-requirements-final.txt +++ b/server-requirements-final.txt @@ -1 +1,4 @@ matplotlib==1.5.3 +pandas==1.4.3 +pretty_html_table +xlsxwriter