Skip to content

Commit

Permalink
add tf-idf vectorizer and verdict
Browse files Browse the repository at this point in the history
  • Loading branch information
arhihihipov committed May 1, 2024
1 parent f927741 commit a2d538d
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 73 deletions.
52 changes: 25 additions & 27 deletions app/criteria/comparison_speech_slides/criterion.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from app.audio import Audio
from app.presentation import Presentation
from app.utils import normalize_text, delete_punctuation
from ..text_comparison import tfidf_similarity, word2vec_similarity, n_gramms_similarity
from ..text_comparison import SlidesSimilarityEvaluator

logger = get_root_logger('web')

Expand All @@ -26,6 +26,7 @@ def __init__(self, parameters, dependent_criteria, name=''):
parameters=parameters,
dependent_criteria=dependent_criteria,
)
self.evaluator = SlidesSimilarityEvaluator()

@property
def description(self):
Expand All @@ -34,8 +35,8 @@ def description(self):
"Описание": t(
"Проверяет, что текст слайда соответствует словам, которые произносит студент во время демонстрации "
"этого слайда"),
# TODO Проработать критерий оценки
"Оценка": t("COMMING SOON")
"Оценка": t("1, если среднее значение соответствия речи содержимому слайдов равно или превосходит 0.125, "
"иначе 8 * r, где r - среднее значение соответствия речи демонстрируемым слайдам")
}

def skip_slide(self, current_slide_text: str) -> bool:
Expand All @@ -46,9 +47,10 @@ def skip_slide(self, current_slide_text: str) -> bool:

def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
criteria_results: dict) -> CriterionResult:
tf_idf = []
word2vec = []
n_grams = []
# Результаты сравнения текстов
results = {}

slides_to_process = []

for current_slide_index in range(len(audio.audio_slides)):
# Список слов, сказанных студентом на данном слайде -- список из RecognizedWord
Expand All @@ -58,6 +60,11 @@ def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,
# Нормализация текста выступления
current_slide_speech = " ".join(normalize_text(current_slide_speech))

# Если на данном слайде ничего не сказано, то не обрабатываем данный слайд
if len(current_slide_speech.split()) == 0:
results[current_slide_index + 1] = 0.000
continue

# Список слов со слайда презентации
current_slide_text = presentation.slides[current_slide_index].words
# Проверяем, входит ли рассматриваемый слайд в список нерассматриваемых
Expand All @@ -67,26 +74,17 @@ def apply(self, audio: Audio, presentation: Presentation, training_id: ObjectId,

# Нормализация текста слайда
current_slide_text = " ".join(normalize_text(current_slide_text.split()))
slides_to_process.append((current_slide_speech, current_slide_text, current_slide_index + 1))

# На этом слайде ничего не сказано или в презентации нет текста -- пропускаем
if len(current_slide_text.split()) == 0 or len(current_slide_speech.split()) == 0:
tf_idf.append(0.000)
word2vec.append(0.000)
n_grams.append(0.000)
continue
self.evaluator.train_model([" ".join(list(map(lambda x: x[0], slides_to_process))), " ".join(list(map(lambda x: x[1], slides_to_process)))])

for speech, slide_text, slide_number in slides_to_process:
results[slide_number] = self.evaluator.evaluate_semantic_similarity(speech, slide_text)

results = dict(sorted(results.items()))

score = 8 * (sum(list(results.values())) / len(list(results.values())))

# TF-IDF
tf_idf.append(tfidf_similarity(current_slide_speech, current_slide_text))
# word2vec
word2vec.append(word2vec_similarity(current_slide_speech, current_slide_text))
# n-gramms
n_grams.append(n_gramms_similarity(current_slide_speech,
current_slide_text,
self.parameters["n_values"],
self.parameters["weights"]))

logger.info(f"TF-IDF: {tf_idf}\n")
logger.info(f"Word2Vec: {word2vec}\n")
logger.info(f"N-grams: {n_grams}\n")

return CriterionResult(1.0, "Отлично")
return CriterionResult(1 if score >= 1 else score, "Отлично" if score >= 1 else "Следует уделить внимание "
"соотвествию речи на слайдах "
"{}".format(",\n".join([f"№{n} - {results[n]}" for n in dict(filter(lambda item: item[1] < 0.125, results.items()))])))
56 changes: 10 additions & 46 deletions app/criteria/text_comparison.py
Original file line number Diff line number Diff line change
@@ -1,53 +1,17 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter


def tfidf_similarity(current_slide_speech: str, current_slide_text: str) -> float:
    """Return the TF-IDF cosine similarity between the spoken text and the
    slide text, rounded to 3 decimal places.

    A fresh vectorizer is fitted on just these two documents, so the score
    reflects term overlap weighted by within-pair rarity.
    """
    documents = [current_slide_speech, current_slide_text]
    matrix = TfidfVectorizer().fit_transform(documents)
    # matrix row 0 = speech, row 1 = slide; take the scalar out of the 1x1 result
    return round(cosine_similarity(matrix[0], matrix[1])[0][0], 3)
class SlidesSimilarityEvaluator:
    """Scores how closely spoken text matches slide text via TF-IDF cosine similarity."""

    def __init__(self):
        # Unigrams only; the vocabulary/IDF weights are learned later in train_model().
        unigram_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
        self.vectorizer = unigram_vectorizer

    def train_model(self, corpus: list):
        """Fit the TF-IDF vocabulary and IDF weights on *corpus* (a list of documents)."""
        self.vectorizer.fit(corpus)

def word2vec_similarity(current_slide_speech: str, current_slide_text: str) -> float:
    """Train a small Word2Vec model on the two texts and return the n_similarity
    between their token sets, rounded to 3 decimal places.

    NOTE(review): Word2Vec initialization is random, so repeated calls may not
    return identical scores — confirm whether a fixed seed is required.
    """
    speech_tokens = word_tokenize(current_slide_speech)
    slide_tokens = word_tokenize(current_slide_text)
    # min_count=1 keeps every token so n_similarity never sees an OOV word.
    model = Word2Vec([speech_tokens, slide_tokens], min_count=1)
    return round(model.wv.n_similarity(speech_tokens, slide_tokens), 3)
def evaluate_semantic_similarity(self, text1: str, text2: str) -> float:
vector1 = self.vectorizer.transform([text1])
vector2 = self.vectorizer.transform([text2])
similarity = cosine_similarity(vector1, vector2)[0][0]


def n_gramms_similarity(current_slide_speech: str, current_slide_text: str, n_values: list, weights: list) -> float:
    """Compare two texts by overlapping word n-grams.

    For each n in *n_values* the per-n similarity is the sum of the minimum
    n-gram counts over the shared n-grams, divided by the larger of the two
    n-gram totals (0.0 when either text yields no n-grams). The per-n scores
    are combined as a weighted sum when *weights* is non-empty, otherwise as
    an unweighted mean.

    Returns the combined similarity rounded to 3 decimal places.
    """
    # E731 fix: a named function instead of a lambda assigned to a name.
    def get_ngrams(text: str, n: int) -> list:
        return [' '.join(gram) for gram in ngrams(word_tokenize(text.lower()), n)]

    similarities = []
    for n in n_values:
        ngrams_text1 = get_ngrams(current_slide_speech, n)
        ngrams_text2 = get_ngrams(current_slide_text, n)

        # Guard first so the counters are only built when they are needed.
        if not ngrams_text1 or not ngrams_text2:
            similarities.append(0.000)
            continue

        counter_text1 = Counter(ngrams_text1)
        counter_text2 = Counter(ngrams_text2)
        intersection = set(counter_text1) & set(counter_text2)

        overlap = sum(min(counter_text1[gram], counter_text2[gram]) for gram in intersection)
        similarities.append(overlap / max(len(ngrams_text1), len(ngrams_text2)))

    # Bug fix: an empty n_values previously raised ZeroDivisionError in the
    # unweighted branch; treat "nothing to compare" as zero similarity.
    if not similarities:
        return 0.000

    if weights:
        combined_similarity = sum(
            weight * similarity for weight, similarity in zip(weights, similarities))
    else:
        combined_similarity = sum(similarities) / len(similarities)

    return round(combined_similarity, 3)
return round(similarity, 3)
5 changes: 5 additions & 0 deletions app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ def normalize_text(text: list) -> list:
return text


# Функция нормализации для списка текстов
def normalize_list(text: list) -> list:
    """Normalize every text in *text*: split each into words, run normalize_text
    on the words, and re-join the result with single spaces."""
    return [" ".join(normalize_text(item.split())) for item in text]


# Удаление пунктуации из текста
def delete_punctuation(text: str) -> str:
    """Strip ASCII punctuation and whitespace control characters (tab, newline,
    carriage return, vertical tab, form feed) from *text*; plain spaces are kept."""
    unwanted = string.punctuation + "\t\n\r\v\f"
    removal_table = str.maketrans("", "", unwanted)
    return text.translate(removal_table)
Expand Down

0 comments on commit a2d538d

Please sign in to comment.