From c8513a0dcddd2d5aa58934171f31f70f36591f11 Mon Sep 17 00:00:00 2001
From: David Hundley
Date: Sun, 15 Dec 2024 00:24:38 -0600
Subject: [PATCH] Add Coleman-Liau Index calculation and update documentation in readability metrics

---
 docs/wiki/metrics/text/readability_metrics.md | 31 +++++++++++-
 whetstone/metrics/text/readability_metrics.py | 48 ++++++++++++++++++-
 2 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/docs/wiki/metrics/text/readability_metrics.md b/docs/wiki/metrics/text/readability_metrics.md
index c045552..2a65dd4 100644
--- a/docs/wiki/metrics/text/readability_metrics.md
+++ b/docs/wiki/metrics/text/readability_metrics.md
@@ -98,4 +98,33 @@ To optimize the Gunning Fog index, consider applying the following strategies:
 
 - Use shorter sentences.
 - Avoid complex words where simpler alternatives exist.
-- Focus on clarity and brevity.
\ No newline at end of file
+- Focus on clarity and brevity.
+
+
+
+## Coleman-Liau Index
+The Coleman-Liau Index is a readability metric used to estimate the grade level required for someone to understand a text. It assesses the complexity of a text based on its characters, words, and sentences, rather than using syllables like some other readability formulas (e.g., the Flesch-Kincaid formula).
+
+The formula for the Coleman-Liau Index is:
+
+$$
+0.0588 \times \text{average letters per 100 words} - 0.296 \times \text{average sentences per 100 words} - 15.8
+$$
+
+Here’s a breakdown of what the ranges generally indicate:
+
+| **CLI Range** | **Reading Level** | **Audience** |
+|------------------|-------------------------------------------------|---------------------------------------|
+| 1.0 – 5.0 | Very easy to read | Young children or early elementary |
+| 6.0 – 8.0 | Fairly easy to read | Upper elementary or middle school |
+| 9.0 – 12.0 | Standard readability (average complexity) | High school students |
+| 13.0 – 16.0 | More complex (college level) | College students |
+| 17.0+ | Very complex (graduate level or professional) | Advanced academic or professional |
+
+To optimize the Coleman-Liau Index metric, consider applying the following strategies:
+
+- Reduce sentence length. (e.g., Break down long, complex sentences into shorter, simpler ones.)
+- Minimize word complexity. (e.g., Use shorter words with fewer characters and avoid technical or uncommon terms.)
+- Focus on conciseness. (e.g., Eliminate unnecessary modifiers or redundant phrases.)
+- Use clear, direct writing. (e.g., Aim for straightforward phrasing without overly elaborate descriptions.)
+- Target grade-level readability. (e.g., Adjust content based on the desired audience grade level to balance simplicity and effectiveness.)
\ No newline at end of file
diff --git a/whetstone/metrics/text/readability_metrics.py b/whetstone/metrics/text/readability_metrics.py
index 6746c46..275e49c 100644
--- a/whetstone/metrics/text/readability_metrics.py
+++ b/whetstone/metrics/text/readability_metrics.py
@@ -116,6 +116,48 @@ def calculate_gunning_fog_index(text: str) -> float:
 
 
 
 
+def calculate_coleman_liau_index(texts: Union[str, List[str]]) -> List[float]:
+    '''
+    Calculate the Coleman-Liau Index for one or multiple texts.
+
+    Inputs:
+        - texts (str or list[str]): The input text(s) for which to calculate the Coleman-Liau Index.
+
+    Returns:
+        - cl_index_scores (list[float]): A list of Coleman-Liau Index scores corresponding to each input text.
+    '''
+    # Wrapping the input in a list if it is a single string
+    if isinstance(texts, str):
+        texts = [texts]
+
+    cl_index_scores = []
+    for text in texts:
+        # Tokenizing sentences and words
+        sentences = tokenize_sentence(text)
+        words = re.findall(r'\b\w+\b', text)
+
+        # Calculating the number of sentences, words, and characters
+        num_sentences = len(sentences)
+        num_words = len(words)
+        num_characters = sum(len(word) for word in words)
+
+        # Avoiding division by zero
+        if num_words == 0 or num_sentences == 0:
+            cl_index_scores.append(0.0)
+            continue
+
+        # Calculating average number of letters per 100 words and sentences per 100 words
+        avg_letters_per_100_words = (num_characters / num_words) * 100
+        avg_sentences_per_100_words = (num_sentences / num_words) * 100
+
+        # Calculating the Coleman-Liau Index
+        score = round(0.0588 * avg_letters_per_100_words - 0.296 * avg_sentences_per_100_words - 15.8, 2)
+        cl_index_scores.append(score)
+
+    return cl_index_scores
+
+
+
 def calculate_all_readability_metrics(texts: Union[str, List[str]]) -> List[dict]:
     '''
@@ -136,14 +178,16 @@ def calculate_all_readability_metrics(texts: Union[str, List[str]]) -> List[dict
     fk_reading_ease_scores = calculate_flesch_kincaid_reading_ease(texts)
     fk_grade_level_scores = calculate_flesch_kincaid_grade_level(texts)
     gunning_fog_scores = [calculate_gunning_fog_index(text) for text in texts]
+    coleman_liau_scores = calculate_coleman_liau_index(texts)
 
     # Combining into a list of dictionaries
     results = []
-    for re_score, gl_score, gf_score in zip(fk_reading_ease_scores, fk_grade_level_scores, gunning_fog_scores):
+    for re_score, gl_score, gf_score, cl_score in zip(fk_reading_ease_scores, fk_grade_level_scores, gunning_fog_scores, coleman_liau_scores):
         readability_metrics = {
             'flesch_kincaid_reading_ease': re_score,
             'flesch_kincaid_grade_level': gl_score,
-            'gunning_fog_index': gf_score
+            'gunning_fog_index': gf_score,
+            'coleman_liau_index': cl_score
         }
         results.append(readability_metrics)
 
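A quick way to exercise the new metric end to end, as a minimal sketch rather than part of the patch: it assumes the package is importable as `whetstone` (mirroring the `whetstone/metrics/text/readability_metrics.py` path above) and that the module's `tokenize_sentence` helper treats the sample as a single sentence; the value in the comments is hand-derived from the formula documented above.

```python
# Minimal usage sketch (not part of the patch). Assumes the repository is
# installed so the patched module is importable as
# whetstone.metrics.text.readability_metrics (mirroring the file path above).
from whetstone.metrics.text.readability_metrics import (
    calculate_all_readability_metrics,
    calculate_coleman_liau_index,
)

sample = "The quick brown fox jumps over the lazy dog."

# A plain string is wrapped into a list internally, so a one-element list of
# scores comes back.
print(calculate_coleman_liau_index(sample))
# Hand check against the documented formula, assuming tokenize_sentence finds
# exactly one sentence: 9 words, 35 letters
#   L = 35 / 9 * 100 ≈ 388.89, S = 1 / 9 * 100 ≈ 11.11
#   CLI = 0.0588 * 388.89 - 0.296 * 11.11 - 15.8 ≈ 3.78

# The aggregate helper now reports the new key alongside the existing metrics.
for metrics in calculate_all_readability_metrics([sample]):
    print(metrics["coleman_liau_index"], metrics["gunning_fog_index"])
```

Note that the hand check above counts only alphabetic characters, while `\b\w+\b` also matches digits and underscores, so texts containing numerals can score slightly differently from a letters-only count.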