From c8513a0dcddd2d5aa58934171f31f70f36591f11 Mon Sep 17 00:00:00 2001
From: David Hundley
Date: Sun, 15 Dec 2024 00:24:38 -0600
Subject: [PATCH] Add Coleman-Liau Index calculation and update documentation in readability metrics

---
 docs/wiki/metrics/text/readability_metrics.md | 31 +++++++++++-
 whetstone/metrics/text/readability_metrics.py | 48 ++++++++++++++++++-
 2 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/docs/wiki/metrics/text/readability_metrics.md b/docs/wiki/metrics/text/readability_metrics.md
index c045552..2a65dd4 100644
--- a/docs/wiki/metrics/text/readability_metrics.md
+++ b/docs/wiki/metrics/text/readability_metrics.md
@@ -98,4 +98,33 @@ To optimize the Gunning Fog index, consider applying the following strategies:
 
 - Use shorter sentences.
 - Avoid complex words where simpler alternatives exist.
-- Focus on clarity and brevity.
\ No newline at end of file
+- Focus on clarity and brevity.
+
+
+
+## Coleman-Liau Index
+The Coleman-Liau Index is a readability metric used to estimate the grade level required for someone to understand a text. It assesses the complexity of a text based on its characters, words, and sentences, rather than using syllables like some other readability formulas (e.g., the Flesch-Kincaid formula).
+
+The formula for the Coleman-Liau Index is:
+
+$$
+0.0588 \times \text{average letters per 100 words} - 0.296 \times \text{average sentences per 100 words} - 15.8
+$$
+
+Here’s a breakdown of what the ranges generally indicate:
+
+| **CLI Range** | **Reading Level** | **Audience** |
+|------------------|-------------------------------------------------|---------------------------------------|
+| 1.0 – 5.0 | Very easy to read | Young children or early elementary |
+| 6.0 – 8.0 | Fairly easy to read | Upper elementary or middle school |
+| 9.0 – 12.0 | Standard readability (average complexity) | High school students |
+| 13.0 – 16.0 | More complex (college level) | College students |
+| 17.0+ | Very complex (graduate level or professional) | Advanced academic or professional |
+
+To optimize the Coleman-Liau Index metric, consider applying the following strategies:
+
+- Reduce sentence length. (e.g., Break down long, complex sentences into shorter, simpler ones.)
+- Minimize word complexity. (e.g., Use shorter words with fewer characters and avoid technical or uncommon terms.)
+- Focus on conciseness. (e.g., Eliminate unnecessary modifiers or redundant phrases.)
+- Use clear, direct writing. (e.g., Aim for straightforward phrasing without overly elaborate descriptions.)
+- Target grade-level readability. (e.g., Adjust content based on the desired audience grade level to balance simplicity and effectiveness.)
\ No newline at end of file
diff --git a/whetstone/metrics/text/readability_metrics.py b/whetstone/metrics/text/readability_metrics.py
index 6746c46..275e49c 100644
--- a/whetstone/metrics/text/readability_metrics.py
+++ b/whetstone/metrics/text/readability_metrics.py
@@ -116,6 +116,48 @@ def calculate_gunning_fog_index(text: str) -> float:
 
 
 
 
+def calculate_coleman_liau_index(texts: Union[str, List[str]]) -> List[float]:
+    '''
+    Calculate the Coleman-Liau Index for one or multiple texts.
+
+    Inputs:
+        - texts (str or list[str]): The input text(s) for which to calculate the Coleman-Liau Index.
+
+    Returns:
+        - cl_index_scores (list[float]): A list of Coleman-Liau Index scores corresponding to each input text.
+    '''
+    # Wrapping the input in a list if it is a single string
+    if isinstance(texts, str):
+        texts = [texts]
+
+    cl_index_scores = []
+    for text in texts:
+        # Tokenizing sentences and words
+        sentences = tokenize_sentence(text)
+        words = re.findall(r'\b\w+\b', text)
+
+        # Calculating the number of sentences, words, and characters
+        num_sentences = len(sentences)
+        num_words = len(words)
+        num_characters = sum(len(word) for word in words)
+
+        # Avoiding division by zero
+        if num_words == 0 or num_sentences == 0:
+            cl_index_scores.append(0.0)
+            continue
+
+        # Calculating average number of letters per 100 words and sentences per 100 words
+        avg_letters_per_100_words = (num_characters / num_words) * 100
+        avg_sentences_per_100_words = (num_sentences / num_words) * 100
+
+        # Calculating the Coleman-Liau Index
+        score = round(0.0588 * avg_letters_per_100_words - 0.296 * avg_sentences_per_100_words - 15.8, 2)
+        cl_index_scores.append(score)
+
+    return cl_index_scores
+
+
+
 def calculate_all_readability_metrics(texts: Union[str, List[str]]) -> List[dict]:
     '''
@@ -136,14 +178,16 @@ def calculate_all_readability_metrics(texts: Union[str, List[str]]) -> List[dict
     fk_reading_ease_scores = calculate_flesch_kincaid_reading_ease(texts)
     fk_grade_level_scores = calculate_flesch_kincaid_grade_level(texts)
     gunning_fog_scores = [calculate_gunning_fog_index(text) for text in texts]
+    coleman_liau_scores = calculate_coleman_liau_index(texts)
 
     # Combining into a list of dictionaries
     results = []
-    for re_score, gl_score, gf_score in zip(fk_reading_ease_scores, fk_grade_level_scores, gunning_fog_scores):
+    for re_score, gl_score, gf_score, cl_score in zip(fk_reading_ease_scores, fk_grade_level_scores, gunning_fog_scores, coleman_liau_scores):
         readability_metrics = {
             'flesch_kincaid_reading_ease': re_score,
             'flesch_kincaid_grade_level': gl_score,
-            'gunning_fog_index': gf_score
+            'gunning_fog_index': gf_score,
+            'coleman_liau_index': cl_score
         }
         results.append(readability_metrics)
 
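A quick way to exercise the new metric end to end, as a minimal sketch rather than part of the patch: it assumes the package is importable as `whetstone` (mirroring the `whetstone/metrics/text/readability_metrics.py` path above) and that the module's `tokenize_sentence` helper treats the sample as a single sentence; the value in the comments is hand-derived from the formula documented above.

```python
# Minimal usage sketch (not part of the patch). Assumes the repository is
# installed so the patched module is importable as
# whetstone.metrics.text.readability_metrics (mirroring the file path above).
from whetstone.metrics.text.readability_metrics import (
    calculate_all_readability_metrics,
    calculate_coleman_liau_index,
)

sample = "The quick brown fox jumps over the lazy dog."

# A plain string is wrapped into a list internally, so a one-element list of
# scores comes back.
print(calculate_coleman_liau_index(sample))
# Hand check against the documented formula, assuming tokenize_sentence finds
# exactly one sentence: 9 words, 35 letters
#   L = 35 / 9 * 100 ≈ 388.89, S = 1 / 9 * 100 ≈ 11.11
#   CLI = 0.0588 * 388.89 - 0.296 * 11.11 - 15.8 ≈ 3.78

# The aggregate helper now reports the new key alongside the existing metrics.
for metrics in calculate_all_readability_metrics([sample]):
    print(metrics["coleman_liau_index"], metrics["gunning_fog_index"])
```

Note that the hand check above counts only alphabetic characters, while `\b\w+\b` also matches digits and underscores, so texts containing numerals can score slightly differently from a letters-only count.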