diff --git a/app/criteria/comparison_speech_slides/criterion.py b/app/criteria/comparison_speech_slides/criterion.py index 2f0fc54..5caec3a 100644 --- a/app/criteria/comparison_speech_slides/criterion.py +++ b/app/criteria/comparison_speech_slides/criterion.py @@ -7,7 +7,7 @@ from app.audio import Audio from app.presentation import Presentation from app.utils import normalize_text, delete_punctuation -from ..text_comparison import tfidf_similarity, word2vec_similarity, n_gramms_similarity +from ..text_comparison import SlidesSimilarityEvaluator logger = get_root_logger('web') @@ -26,6 +26,7 @@ def __init__(self, parameters, dependent_criteria, name=''): parameters=parameters, dependent_criteria=dependent_criteria, ) + self.evaluator = SlidesSimilarityEvaluator() @property def description(self): @@ -34,8 +35,8 @@ def description(self): "Описание": t( "Проверяет, что текст слайда соответствует словам, которые произносит студент во время демонстрации " "этого слайда"), - # TODO Проработать критерий оценки - "Оценка": t("COMMING SOON") + "Оценка": t("1, если среднее значение соответствия речи содержимому слайдов равно или превосходит 0.125, " + "иначе 8 * r, где r - среднее значение соответствия речи демонстрируемым слайдам") } def skip_slide(self, current_slide_text: str) -> bool: @@ -46,9 +47,10 @@ def skip_slide(self, current_slide_text: str) -> bool: def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId, criteria_results: dict) -> CriterionResult: - tf_idf = [] - word2vec = [] - n_grams = [] + # Результаты сравнения текстов + results = {} + + slides_to_process = [] for current_slide_index in range(len(audio.audio_slides)): # Список слов, сказанных студентом на данном слайде -- список из RecognizedWord @@ -58,6 +60,11 @@ def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId, # Нормализация текста выступления current_slide_speech = " ".join(normalize_text(current_slide_speech)) + # Если на данном слайде ничего не 
сказано, то не обрабатываем данный слайд + if len(current_slide_speech.split()) == 0: + results[current_slide_index + 1] = 0.000 + continue + # Список слов со слайда презентации current_slide_text = presentation.slides[current_slide_index].words # Проверяем, входит ли рассматриваемый слайд в список нерасмматриваемых @@ -67,26 +74,17 @@ def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId, # Нормализация текста слайда current_slide_text = " ".join(normalize_text(current_slide_text.split())) + slides_to_process.append((current_slide_speech, current_slide_text, current_slide_index + 1)) - # На этом слайде ничего не сказано или в презентации нет текста -- пропускаем - if len(current_slide_text.split()) == 0 or len(current_slide_speech.split()) == 0: - tf_idf.append(0.000) - word2vec.append(0.000) - n_grams.append(0.000) - continue + self.evaluator.train_model([" ".join(list(map(lambda x: x[0], slides_to_process))), " ".join(list(map(lambda x: x[1], slides_to_process)))]) + + for speech, slide_text, slide_number in slides_to_process: + results[slide_number] = self.evaluator.evaluate_semantic_similarity(speech, slide_text) + + results = dict(sorted(results.items())) + + score = 8 * (sum(list(results.values())) / len(list(results.values()))) - # TF-IDF - tf_idf.append(tfidf_similarity(current_slide_speech, current_slide_text)) - # word2vec - word2vec.append(word2vec_similarity(current_slide_speech, current_slide_text)) - # n-gramms - n_grams.append(n_gramms_similarity(current_slide_speech, - current_slide_text, - self.parameters["n_values"], - self.parameters["weights"])) - - logger.info(f"TF-IDF: {tf_idf}\n") - logger.info(f"Word2Vec: {word2vec}\n") - logger.info(f"N-grams: {n_grams}\n") - - return CriterionResult(1.0, "Отлично") + return CriterionResult(1 if score >= 1 else score, "Отлично" if score >= 1 else "Следует уделить внимание " + "соответствию речи на слайдах " + "{}".format(",\n".join([f"№{n} - {results[n]}" for n in 
dict(filter(lambda item: item[1] < 0.125, results.items()))]))) diff --git a/app/criteria/text_comparison.py b/app/criteria/text_comparison.py index 04d52d2..04d0bf3 100644 --- a/app/criteria/text_comparison.py +++ b/app/criteria/text_comparison.py @@ -1,53 +1,17 @@ from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity -from gensim.models import Word2Vec -from nltk.tokenize import word_tokenize -from nltk.util import ngrams -from collections import Counter -def tfidf_similarity(current_slide_speech: str, current_slide_text: str) -> float: - corpus = [current_slide_speech, current_slide_text] - vectorizer = TfidfVectorizer() - X = vectorizer.fit_transform(corpus) - cosine_sim = cosine_similarity(X[0], X[1]) - similarity = cosine_sim[0][0] - return round(similarity, 3) +class SlidesSimilarityEvaluator: + def __init__(self): + self.vectorizer = TfidfVectorizer(ngram_range=(1, 1)) + def train_model(self, corpus: list): + self.vectorizer.fit(corpus) -def word2vec_similarity(current_slide_speech: str, current_slide_text: str) -> float: - tokens_speech = word_tokenize(current_slide_speech) - tokens_slide = word_tokenize(current_slide_text) - sentences = [tokens_speech, tokens_slide] - model = Word2Vec(sentences, min_count=1) - similarity = model.wv.n_similarity(tokens_speech, tokens_slide) - return round(similarity, 3) + def evaluate_semantic_similarity(self, text1: str, text2: str) -> float: + vector1 = self.vectorizer.transform([text1]) + vector2 = self.vectorizer.transform([text2]) + similarity = cosine_similarity(vector1, vector2)[0][0] - -def n_gramms_similarity(current_slide_speech: str, current_slide_text: str, n_values: list, weights: list) -> float: - get_ngrams = lambda text, n: [' '.join(gram) for gram in ngrams(word_tokenize(text.lower()), n)] - similarities = [] - for n in n_values: - ngrams_text1 = get_ngrams(current_slide_speech, n) - ngrams_text2 = get_ngrams(current_slide_text, n) - - 
counter_text1 = Counter(ngrams_text1) - counter_text2 = Counter(ngrams_text2) - - intersection = set(ngrams_text1) & set(ngrams_text2) - - if len(ngrams_text1) == 0 or len(ngrams_text2) == 0: - similarities.append(0.000) - else: - similarity = sum( - min(counter_text1[ngram], counter_text2[ngram]) for ngram in intersection) / max( - len(ngrams_text1), len(ngrams_text2)) - similarities.append(similarity) - - if weights: - combined_similarity = sum( - weight * similarity for weight, similarity in zip(weights, similarities)) - else: - combined_similarity = sum(similarities) / len(similarities) - - return round(combined_similarity, 3) + return round(similarity, 3) diff --git a/app/utils.py b/app/utils.py index 79bf379..4961cd5 100644 --- a/app/utils.py +++ b/app/utils.py @@ -204,6 +204,11 @@ def normalize_text(text: list) -> list: return text +# Функция нормализации для списка текстов +def normalize_list(text: list) -> list: + return list(map(lambda x: " ".join(x), map(lambda x: normalize_text(x.split()), text))) + + # Удаление пунктуации из текста def delete_punctuation(text: str) -> str: return text.translate(str.maketrans('', '', string.punctuation + "\t\n\r\v\f"))