Addressing issue #1: Added print statements to the
decomposition.ClusterDecomposer.get_component_repr
method to avoid errors when no components are found.

Addressing issue #2: We've added functionality to the
`SCA` initialization method to allow for custom tokenizers.
We've also added respective test cases and notes in the README.
eichinflo committed Jan 8, 2025
1 parent e5c1c58 commit 1274f9f
Showing 5 changed files with 65 additions and 3 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -97,6 +97,26 @@ Besides, we publish the Chinese News dataset, which we acquired to the Twitter A

The current version of the Hausa Tweet dataset is available at the [NaijaSenti repository](https://github.com/hausanlp/NaijaSenti/blob/main/sections/unlabeled_twitter_corpus.md).

## Support of Other Languages

We wanted to make `SCA` as adaptable as possible to use cases in other languages. The main parts of the pipeline that do not always generalize across languages are the base embedding model as well as the tokenizer and stopword list used to compute the c-TF-IDF representations. Since the embeddings are calculated outside of the `SCA` class, you can use any method that outputs vector-valued embeddings. Just make sure to pass them as a `numpy.array` or equivalent.
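
For example, the following sketch (using a toy corpus and a random stand-in for whatever embedding model you actually use; neither the `embed` helper nor the corpus is part of `SCA`) shows the expected shape of the array:

```python
import numpy as np

documents = ["a first toy document", "a second toy document"]

def embed(text):
    # Stand-in for your embedding model of choice; any function that maps a
    # document to a fixed-length vector works here.
    rng = np.random.default_rng(abs(hash(text)) % 2**32)
    return rng.normal(size=384)

# SCA expects one row per document and one column per embedding dimension.
embeddings = np.array([embed(doc) for doc in documents])
print(embeddings.shape)  # (2, 384)
```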

For the other parts, you can pass custom versions when instantiating `SCA`:

```python
from semantic_components.sca import SCA
from semantic_components.representation import GenericTokenizer

custom_tokenizer = GenericTokenizer()
custom_stopwords_path = "path/to/stopwords.txt"

# fit SCA model to data
sca = SCA(tokenizer=custom_tokenizer, stopwords_path=custom_stopwords_path)
scores, residuals, ids = sca.fit(documents, embeddings)
```

You can look at the implementation of `GenericTokenizer` for a minimal example of what your custom tokenizer should do (you only need to implement `tokenize` and `__call__`). The stopwords are passed as a path to a stopwords file, where each line is interpreted as a single stopword. The representer will ignore these words when calculating the token representations. Passing either of these arguments will overwrite the respective standard choices inferred from the `language` argument (which currently only supports Chinese and English, though the English setting generalizes to other languages where tokens are separated by whitespace).
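
As a rough sketch of such a tokenizer (the class name and the simple whitespace splitting are just placeholders, not code from the library), something along these lines would work with the snippet above:

```python
from semantic_components.sca import SCA


class MyWhitespaceTokenizer:
    """Minimal custom tokenizer sketch: lowercase, split on whitespace, drop stopwords.

    Only `tokenize` and `__call__` are required by the representer; everything
    else (normalization, n-gram handling, ...) is up to you.
    """

    def __init__(self, stopwords=None):
        self.stopwords = set(stopwords) if stopwords is not None else set()

    def tokenize(self, text):
        return [tok for tok in text.lower().split() if tok not in self.stopwords]

    def __call__(self, text):
        return self.tokenize(text)


sca = SCA(tokenizer=MyWhitespaceTokenizer())
```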

## AI Usage Disclaimer

The code in this repository has been written with the support of code completions of an AI coding assistant, namely GitHub Copilot. Completions were mostly single lines up to a few lines of code and were always checked carefully to ensure their functionality and safety. Furthermore, we did our best to avoid accepting code completions that would be incompatible with the license of our code or could be regarded as plagiarism.
8 changes: 8 additions & 0 deletions semantic_components/decomposition.py
@@ -508,6 +508,14 @@ def get_component_repr(self, i):
"""
Get the representation of a component.
"""
if not self.component_vectors:
print("ClusterDecomposer: No components have been fitted.")
return None

if i >= len(self.component_vectors):
print("ClusterDecomposer: Component index out of bounds.")
return None

return self.component_vectors[i]


4 changes: 2 additions & 2 deletions semantic_components/representation.py
@@ -97,6 +97,8 @@ def __init__(
self.log = lambda x: None if log is None else log(x)
self.n_grams = n_grams

self.token_pattern = r"\b..+\b"

if self.stopwords is None:
self.stopwords = self.load_stopwords(stopwords_path)

@@ -108,7 +110,6 @@ def __init__(
self.token_pattern = r"(?u)\b\w\w+\b"
self.tokenizer = EnglishTokenizer(stopwords=self.stopwords)
else:
self.token_pattern = r"\b..+\b"
self.tokenizer = GenericTokenizer(stopwords=self.stopwords)

self.ctfidf_model = None
@@ -717,4 +718,3 @@ def tokenize(self, text):
def __call__(self, text):
return self.tokenize(text)


18 changes: 17 additions & 1 deletion semantic_components/sca.py
@@ -81,12 +81,17 @@ class SCA:
language : str, optional
The language. Default is "english". For the languages we discuss in the paper, this influences the tokenizer and
the stopword list for the representation.
tokenizer : optional
Pass a custom tokenizer. The tokenizer should implement a `tokenize` and `__call__` function. See
`semantic_components.representation.GenericTokenizer` for a minimal example. Default is None.
evaluation : bool, optional
Whether to evaluate the model. Default is False.
stopwords_path : str, optional
The path to the stopwords file. Default is None.
The path to the stopwords file. Default is None, which will use the standard stopwords for the specified
language.
verbose : bool, optional
Whether to print the progress. Default is False.
@@ -127,6 +132,7 @@ def __init__(
n_history=2,
max_iterations=10,
language="english",
tokenizer=None,
evaluation=False,
stopwords_path=None,
verbose=False,
@@ -152,6 +158,7 @@ def __init__(
self.n_history = n_history
self.max_iterations = max_iterations
self.language = language
self.tokenizer = tokenizer
self.stopwords_path = stopwords_path
self.verbose = verbose
self.logging = logging
@@ -169,6 +176,14 @@ def __init__(
def fit(self, documents, embeddings):
"""
Fit the SCA model.

Parameters
----------
documents : pd.DataFrame
The documents to fit the model on.
embeddings : np.array
The embeddings to fit the model on.
"""
residuals, scores, ids = self.decompose(documents, embeddings)

@@ -263,6 +278,7 @@ def represent(self, documents, embeddings, assignments):

ctfidf_representer = CTFIDFRepresenter(
language=self.language,
tokenizer=self.tokenizer,
stopwords_path=self.stopwords_path,
verbose=self.verbose,
log=self.log,
18 changes: 18 additions & 0 deletions test/test_sca.py
@@ -4,6 +4,7 @@

# from cluster_pipeline import CTFIDFInterface
from semantic_components.sca import SCA
from semantic_components.representation import GenericTokenizer


class ToyHDBSCAN:
@@ -196,3 +197,20 @@ def test_evaluate(self):
)

sca.fit(self.documents, self.embeddings)

def test_custom_tokenizer(self):
tokenizer = GenericTokenizer()
sca = SCA(
dim_reduction_algorithm=self.dim_reduction,
cluster_algorithm=self.clustering_algorithm,
normalize_components=True,
tokenizer=tokenizer,
language="zh",
mu=0.5,
termination_crit="new_components",
verbose=True,
)

sca.fit(self.documents, self.embeddings)

