Skip to content
This repository has been archived by the owner on Mar 1, 2022. It is now read-only.

Updates can be optionally sorted by word frequency or score #66

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/SmoothNLP.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion smoothnlp/algorithm/phrase/ngram_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -206,7 +206,8 @@ def get_scores(corpus,
left_right_entropy[word][0], #left_entropy
left_right_entropy[word][1], #right_entropy
min(left_right_entropy[word][0],left_right_entropy[word][1]), #branch entropy BE=min{left_entropy,right_entropy}
word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1] #our score
word_liberalization(left_right_entropy[word][0],left_right_entropy[word][1])+mi[word][1], #our score
ngram_freq[word] # word frequency
)
for word in joint_phrase}

Expand Down
11 changes: 8 additions & 3 deletions smoothnlp/algorithm/phrase/phrase_extraction.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,14 +39,16 @@ def extract_phrase(corpus,
chunk_size: int = 1000000,
min_n:int = 2,
max_n:int=4,
min_freq:int = 5):
min_freq:int = 5,
order_by: str = 'score'):
'''
取前k个new words或前k%的new words
按score或者freq取前k个new words或前k%的new words
:param corpus:
:param top_k:
:param chunk_size:
:param max_n:
:param min_freq:
:param order_by:
:return:
'''
if isinstance(corpus,str):
Expand All @@ -57,7 +59,10 @@ def extract_phrase(corpus,
else:
corpus_splits = chunk_generator_adapter(corpus, chunk_size)
word_info_scores = get_scores(corpus_splits,min_n,max_n,chunk_size,min_freq)
new_words = [item[0] for item in sorted(word_info_scores.items(),key=lambda item:item[1][-1],reverse = True)]
if order_by == 'score':
new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-2], reverse=True)]
elif order_by == 'freq':
new_words = [item[0] for item in sorted(word_info_scores.items(), key=lambda item: item[1][-1], reverse=True)]
if top_k > 1: #输出前k个词
return new_words[:top_k]
elif top_k < 1: #输出前k%的词
Expand Down