From 6ab9af1cfe74a126e709539a2467426d0881945c Mon Sep 17 00:00:00 2001
From: Maarten Grootendorst
Date: Tue, 28 Sep 2021 15:29:27 +0200
Subject: [PATCH] v0.5 (#58)

* Guided KeyBERT

* Update default SBERT model
---
 .github/workflows/testing.yml            |  2 +-
 README.md                                | 12 +++-------
 docs/changelog.md                        | 14 ++++++++++++
 docs/faq.md                              | 27 +++++++++++++++++++++--
 docs/guides/embeddings.md                |  4 ++--
 docs/guides/quickstart.md                | 28 +++++++++++++++++++++++-
 keybert/__init__.py                      |  2 +-
 keybert/_model.py                        | 22 ++++++++++++++-----
 keybert/backend/_sentencetransformers.py |  6 ++---
 keybert/backend/_utils.py                |  2 +-
 setup.py                                 |  5 ++---
 tests/test_model.py                      | 18 ++++++++++++++-
 12 files changed, 113 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
index f20c8c40..df8a3e2a 100644
--- a/.github/workflows/testing.yml
+++ b/.github/workflows/testing.yml
@@ -26,6 +26,6 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e ".[test]"
+          pip install -e ".[dev]"
       - name: Run Checking Mechanisms
         run: make check
diff --git a/README.md b/README.md
index 0ae2de62..a7d96f75 100644
--- a/README.md
+++ b/README.md
@@ -75,12 +75,6 @@ pip install keybert[spacy]
 pip install keybert[use]
 ```

-To install all backends:
-
-```
-pip install keybert[all]
-```
-
 ### 2.2. Usage
@@ -136,7 +130,7 @@ keywords = kw_model.extract_keywords(doc, highlight=True)

 **NOTE**: For a full overview of all possible transformer models see [sentence-transformer](https://www.sbert.net/docs/pretrained_models.html).
-I would advise either `"paraphrase-MiniLM-L6-v2"` for English documents or `"paraphrase-multilingual-MiniLM-L12-v2"`
+I would advise either `"all-MiniLM-L6-v2"` for English documents or `"paraphrase-multilingual-MiniLM-L12-v2"`
 for multi-lingual documents or any other language.
@@ -205,7 +199,7 @@ and pass it through KeyBERT with `model`:

 ```python
 from keybert import KeyBERT
-kw_model = KeyBERT(model='paraphrase-MiniLM-L6-v2')
+kw_model = KeyBERT(model='all-MiniLM-L6-v2')
 ```

 Or select a SentenceTransformer model with your own parameters:
@@ -214,7 +208,7 @@ Or select a SentenceTransformer model with your own parameters:

 ```python
 from keybert import KeyBERT
 from sentence_transformers import SentenceTransformer

-sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 kw_model = KeyBERT(model=sentence_model)
 ```
diff --git a/docs/changelog.md b/docs/changelog.md
index fc56e812..4437ec6a 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,3 +1,17 @@
+## **Version 0.5.0**
+*Release date: 28 September, 2021*
+
+**Highlights**:
+
+* Added Guided KeyBERT
+    * `kw_model.extract_keywords(doc, seed_keywords=seed_keywords)`
+    * Thanks to [@zolekode](https://github.com/zolekode) for the inspiration!
+* Use the newest all-* models from SBERT
+
+**Miscellaneous**:
+
+* Added instructions in the FAQ to extract keywords from Chinese documents
+
 ## **Version 0.4.0**
 *Release date: 23 June, 2021*
diff --git a/docs/faq.md b/docs/faq.md
index a883b58c..1ac43fd5 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -1,7 +1,7 @@
 ## **Which embedding model works best for which language?**
 Unfortunately, there is not a definitive list of the best models for each language; this highly depends on your data, the model, and your specific use-case. However, the default model in KeyBERT
-(`"paraphrase-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual**
+(`"all-MiniLM-L6-v2"`) works great for **English** documents. In contrast, for **multi-lingual**
 documents or any other language, `"paraphrase-multilingual-MiniLM-L12-v2"` has shown great performance. If you want to use a model that provides a higher quality, but takes more compute time, then I would advise using `paraphrase-mpnet-base-v2` and `paraphrase-multilingual-mpnet-base-v2` instead.
@@ -17,4 +17,27 @@ topic modeling to HTML-code to extract topics of code, then it becomes important

 ## **Can I use the GPU to speed up the model?**
 Yes! Since KeyBERT uses embeddings as its backend, a GPU is actually preferred when using this package.
-Although it is possible to use it without a dedicated GPU, the inference speed will be significantly slower.
\ No newline at end of file
+Although it is possible to use it without a dedicated GPU, the inference speed will be significantly slower.
+
+## **How can I use KeyBERT with Chinese documents?**
+You need to make sure you use a tokenizer in KeyBERT that supports tokenization of Chinese. I suggest installing [`jieba`](https://github.com/fxsjy/jieba) for this:
+
+```python
+from sklearn.feature_extraction.text import CountVectorizer
+import jieba
+
+def tokenize_zh(text):
+    words = jieba.lcut(text)
+    return words
+
+vectorizer = CountVectorizer(tokenizer=tokenize_zh)
+```
+
+Then, simply pass the vectorizer to your KeyBERT instance:
+
+```python
+from keybert import KeyBERT
+
+kw_model = KeyBERT()
+keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
+```
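+
+Note that for Chinese documents you will likely also want a multilingual embedding model instead of the English default. A minimal sketch combining the two, reusing `doc` and `vectorizer` from above (the model name comes from the language FAQ at the top of this page):
+
+```python
+from keybert import KeyBERT
+
+# A multilingual SBERT model is a better fit for Chinese than the English
+# default; the jieba-based vectorizer above handles the tokenization.
+kw_model = KeyBERT(model="paraphrase-multilingual-MiniLM-L12-v2")
+keywords = kw_model.extract_keywords(doc, vectorizer=vectorizer)
+```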
\ No newline at end of file
diff --git a/docs/guides/embeddings.md b/docs/guides/embeddings.md
index febe4799..b44ea236 100644
--- a/docs/guides/embeddings.md
+++ b/docs/guides/embeddings.md
@@ -8,7 +8,7 @@ and pass it through KeyBERT with `model`:

 ```python
 from keybert import KeyBERT
-kw_model = KeyBERT(model="paraphrase-MiniLM-L6-v2")
+kw_model = KeyBERT(model="all-MiniLM-L6-v2")
 ```

 Or select a SentenceTransformer model with your own parameters:
@@ -16,7 +16,7 @@ Or select a SentenceTransformer model with your own parameters:

 ```python
 from sentence_transformers import SentenceTransformer

-sentence_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
 kw_model = KeyBERT(model=sentence_model)
 ```
diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md
index ddf130ac..a9dac5c3 100644
--- a/docs/guides/quickstart.md
+++ b/docs/guides/quickstart.md
@@ -72,7 +72,7 @@ keywords = kw_model.extract_keywords(doc, highlight=True)
 ```

 **NOTE**: For a full overview of all possible transformer models see [sentence-transformer](https://www.sbert.net/docs/pretrained_models.html).
-I would advise either `"paraphrase-MiniLM-L6-v2"` for English documents or `"paraphrase-multilingual-MiniLM-L12-v2"`
+I would advise either `"all-MiniLM-L6-v2"` for English documents or `"paraphrase-multilingual-MiniLM-L12-v2"`
 for multi-lingual documents or any other language.

 ### Max Sum Similarity
@@ -147,4 +147,30 @@ candidates = [candidate[0] for candidate in candidates]

 # KeyBERT init
 kw_model = KeyBERT()
 keywords = kw_model.extract_keywords(doc, candidates)
-```
\ No newline at end of file
+```
+
+### Guided KeyBERT
+
+Guided KeyBERT is similar to Guided Topic Modeling in that it tries to steer the extraction towards a set of seeded terms. When applying KeyBERT, it automatically extracts the keywords most related to a specific document. However, there are times when stakeholders and users are looking for specific types of keywords. For example, when publishing an article on your website through Contentful, you typically already know the global keywords related to the article. However, there might be a specific topic in the article that you would like to be extracted through the keywords. To achieve this, we simply give KeyBERT a set of related seeded keywords (it can also be a single one!) and search for keywords that are similar to both the document and the seeded keywords.
+
+Using this feature is as simple as defining a list of seeded keywords and passing them to KeyBERT:
+
+```python
+doc = """
+    Supervised learning is the machine learning task of learning a function that
+    maps an input to an output based on example input-output pairs.[1] It infers a
+    function from labeled training data consisting of a set of training examples.[2]
+    In supervised learning, each example is a pair consisting of an input object
+    (typically a vector) and a desired output value (also called the supervisory signal).
+    A supervised learning algorithm analyzes the training data and produces an inferred function,
+    which can be used for mapping new examples. An optimal scenario will allow for the
+    algorithm to correctly determine the class labels for unseen instances. This requires
+    the learning algorithm to generalize from the training data to unseen situations in a
+    'reasonable' way (see inductive bias).
+    """
+
+kw_model = KeyBERT()
+seed_keywords = ["information"]
+keywords = kw_model.extract_keywords(doc, use_mmr=True, diversity=0.1, seed_keywords=seed_keywords)
+```
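+
+Under the hood, the seed keywords are embedded and merged with the document embedding through a weighted average before the candidate keywords are ranked (see `keybert/_model.py` in this patch). A minimal sketch of that step, with random arrays standing in for real SBERT embeddings:
+
+```python
+import numpy as np
+
+# Stand-in embeddings; in KeyBERT these come from the sentence-transformers backend.
+doc_embedding = np.random.rand(1, 384)
+seed_embeddings = np.random.rand(1, 384)
+
+# 3:1 weighted average: the document keeps most of the weight, so the seeds
+# nudge the similarity ranking rather than dominate it.
+doc_embedding = np.average([doc_embedding, seed_embeddings], axis=0, weights=[3, 1])
+```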
\ No newline at end of file
diff --git a/keybert/__init__.py b/keybert/__init__.py
index 6d8bc87b..1e5d20eb 100644
--- a/keybert/__init__.py
+++ b/keybert/__init__.py
@@ -1,3 +1,3 @@
 from keybert._model import KeyBERT

-__version__ = "0.4.0"
+__version__ = "0.5.0"
diff --git a/keybert/_model.py b/keybert/_model.py
index 497adafb..61021c5f 100644
--- a/keybert/_model.py
+++ b/keybert/_model.py
@@ -31,7 +31,7 @@ class KeyBERT:
     """

     def __init__(self,
-                 model="paraphrase-MiniLM-L6-v2"):
+                 model="all-MiniLM-L6-v2"):
        """ KeyBERT initialization

        Arguments:
@@ -60,8 +60,9 @@ def extract_keywords(self,
                         diversity: float = 0.5,
                         nr_candidates: int = 20,
                         vectorizer: CountVectorizer = None,
-                        highlight: bool = False) -> Union[List[Tuple[str, float]],
-                                                          List[List[Tuple[str, float]]]]:
+                        highlight: bool = False,
+                        seed_keywords: List[str] = None) -> Union[List[Tuple[str, float]],
+                                                                  List[List[Tuple[str, float]]]]:
        """ Extract keywords/keyphrases

        NOTE:
@@ -99,6 +100,8 @@ def extract_keywords(self,
            highlight: Whether to print the document and highlight its keywords/keyphrases. NOTE: This does not work if multiple documents are passed.
+           seed_keywords: Seed keywords that may guide the extraction of keywords by
+                          steering the similarities towards the seeded keywords

        Returns:
            keywords: the top n keywords for a document with their respective distances
@@ -116,7 +119,8 @@ def extract_keywords(self,
                                                         use_mmr=use_mmr,
                                                         diversity=diversity,
                                                         nr_candidates=nr_candidates,
-                                                        vectorizer=vectorizer)
+                                                        vectorizer=vectorizer,
+                                                        seed_keywords=seed_keywords)
            if highlight:
                highlight_document(docs, keywords)

@@ -143,7 +147,8 @@ def _extract_keywords_single_doc(self,
                                     use_mmr: bool = False,
                                     diversity: float = 0.5,
                                     nr_candidates: int = 20,
-                                    vectorizer: CountVectorizer = None) -> List[Tuple[str, float]]:
+                                    vectorizer: CountVectorizer = None,
+                                    seed_keywords: List[str] = None) -> List[Tuple[str, float]]:
        """ Extract keywords/keyphrases for a single document

        Arguments:
@@ -157,6 +162,8 @@ def _extract_keywords_single_doc(self,
            diversity: The diversity of results between 0 and 1 if use_mmr is True
            nr_candidates: The number of candidates to consider if use_maxsum is set to True
            vectorizer: Pass in your own CountVectorizer from scikit-learn
+           seed_keywords: Seed keywords that may guide the extraction of keywords by
+                          steering the similarities towards the seeded keywords

        Returns:
            keywords: the top n keywords for a document with their respective distances
@@ -175,6 +182,11 @@ def _extract_keywords_single_doc(self,
        doc_embedding = self.model.embed([doc])
        candidate_embeddings = self.model.embed(candidates)

+       # Guided KeyBERT with seed keywords
+       if seed_keywords is not None:
+           seed_embeddings = self.model.embed([" ".join(seed_keywords)])
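+           # Average the document and seed embeddings with a 3:1 weighting so that
+           # the seeds steer the similarities without overwhelming the document itself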
+           doc_embedding = np.average([doc_embedding, seed_embeddings], axis=0, weights=[3, 1])
+
        # Calculate distances and extract keywords
        if use_mmr:
            keywords = mmr(doc_embedding, candidate_embeddings, candidates, top_n, diversity)
diff --git a/keybert/backend/_sentencetransformers.py b/keybert/backend/_sentencetransformers.py
index 60e4845c..2e599dbc 100644
--- a/keybert/backend/_sentencetransformers.py
+++ b/keybert/backend/_sentencetransformers.py
@@ -16,13 +16,13 @@ class SentenceTransformerBackend(BaseEmbedder):
    sentence-transformers model:
    ```python
    from keybert.backend import SentenceTransformerBackend
-   sentence_model = SentenceTransformerBackend("paraphrase-MiniLM-L6-v2")
+   sentence_model = SentenceTransformerBackend("all-MiniLM-L6-v2")
    ```

    or you can instantiate a model yourself:
    ```python
    from keybert.backend import SentenceTransformerBackend
    from sentence_transformers import SentenceTransformer
-   embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
+   embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    sentence_model = SentenceTransformerBackend(embedding_model)
    ```
    """
@@ -36,7 +36,7 @@ def __init__(self, embedding_model: Union[str, SentenceTransformer]):
        else:
            raise ValueError("Please select a correct SentenceTransformers model: \n"
                             "`from sentence_transformers import SentenceTransformer` \n"
-                            "`model = SentenceTransformer('paraphrase-MiniLM-L6-v2')`")
+                            "`model = SentenceTransformer('all-MiniLM-L6-v2')`")

    def embed(self,
              documents: List[str],
diff --git a/keybert/backend/_utils.py b/keybert/backend/_utils.py
index 4d135129..9d41adfe 100644
--- a/keybert/backend/_utils.py
+++ b/keybert/backend/_utils.py
@@ -4,7 +4,7 @@ def select_backend(embedding_model) -> BaseEmbedder:
    """ Select an embedding model based on language or a specific sentence transformer model.

-   When selecting a language, we choose `paraphrase-MiniLM-L6-v2` for English and
+   When selecting a language, we choose `all-MiniLM-L6-v2` for English and
    `paraphrase-multilingual-MiniLM-L12-v2` for all other languages, as it supports 100+ languages.

    Returns:
diff --git a/setup.py b/setup.py
index 2527b71e..e8f7ca14 100644
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@
 setup(
     name="keybert",
     packages=find_packages(exclude=["notebooks", "docs"]),
-    version="0.4.0",
+    version="0.5.0",
     author="Maarten Grootendorst",
     author_email="maartengrootendorst@gmail.com",
     description="KeyBERT performs keyword extraction with state-of-the-art transformer models.",
@@ -76,8 +76,7 @@
         "test": test_packages,
         "docs": docs_packages,
         "dev": dev_packages,
-        "flair": flair_packages,
-        "all": extra_packages
+        "flair": flair_packages
     },
     python_requires='>=3.6',
 )
\ No newline at end of file
diff --git a/tests/test_model.py b/tests/test_model.py
index 977f98eb..487e16ed 100644
--- a/tests/test_model.py
+++ b/tests/test_model.py
@@ -4,7 +4,7 @@
 from keybert import KeyBERT

 doc_one, doc_two = get_test_data()
-model = KeyBERT(model='paraphrase-MiniLM-L6-v2')
+model = KeyBERT(model='all-MiniLM-L6-v2')


 @pytest.mark.parametrize("keyphrase_length", [(1, i+1) for i in range(5)])
@@ -68,6 +68,22 @@ def test_extract_keywords_multiple_docs(keyphrase_length):
         assert len(keyword[0].split(" ")) <= keyphrase_length[1]


+def test_guided():
+    """ Test whether the keywords are correctly extracted """
+    top_n = 5
+    seed_keywords = ["time", "night", "day", "moment"]
+    keywords = model.extract_keywords(doc_one,
+                                      min_df=1,
+                                      top_n=top_n,
+                                      seed_keywords=seed_keywords)
+
+    assert isinstance(keywords, list)
+    assert isinstance(keywords[0], tuple)
+    assert isinstance(keywords[0][0], str)
+    assert isinstance(keywords[0][1], float)
+    assert len(keywords) == top_n
+
+
 def test_error():
     """ Empty doc should raise a ValueError """
     with pytest.raises(AttributeError):