diff --git a/README.md b/README.md index c3f0fe7..35086ac 100644 --- a/README.md +++ b/README.md @@ -144,3 +144,39 @@ paper. Categories:\n + +## Real-World Problem Solving + +The `real_world_problems` module extends the capabilities of alpha-math to tackle real-world challenges. This module currently includes a mental health analysis component, demonstrating the application of data analysis and visualization techniques to a practical domain. + +### Mental Health Dataset + +The mental health dataset used in this module contains information about individuals' mental health statuses and their corresponding statements. This dataset allows us to explore the relationship between linguistic features and mental health conditions. + +### Functions in mental_health_analysis.py + +1. `load_dataset(filename='Combined Data.csv')`: Loads the mental health dataset from a CSV file. +2. `perform_eda(data)`: Conducts exploratory data analysis on the dataset, including basic information, descriptive statistics, and visualizations of mental health status distributions. +3. `analyze_sentiment_length(data)`: Analyzes the relationship between statement length and mental health status, providing insights into how the length of expressions might correlate with different mental health conditions. + +### Running the Script + +To run the mental health analysis script, use the following command: + +```shell +python -m alphamath.real_world_problems.mental_health_analysis +``` + +### Interpreting the Results + +The script generates two main outputs: + +1. Exploratory Data Analysis (EDA) results: Stored in the `eda_results` dictionary, providing an overview of the dataset's characteristics. +2. Sentiment Length Analysis results: Stored in the `sentiment_length_results` dictionary, offering insights into the relationship between statement length and mental health status. + +Additionally, two visualizations are generated: + +1. 
"""Mental health dataset analysis (alphamath/real_world_problems/mental_health_analysis.py).

Explores a CSV of statements labelled with a mental health status. Running the
module as a script writes two charts to the current working directory:

1. ``mental_health_distribution.png``: a bar chart showing the distribution of
   mental health statuses in the dataset.
2. ``statement_length_by_status.png``: a bar chart displaying the average
   statement length for each mental health status.

These results and visualizations can be used to gain insights into the dataset
and potentially inform further analysis or machine learning tasks related to
mental health assessment based on textual data.
"""

import io
from typing import Optional

import pandas as pd


def _require_columns(data, columns):
    """Raise ``KeyError`` naming every column in *columns* absent from *data*."""
    missing = [column for column in columns if column not in data.columns]
    if missing:
        raise KeyError(f"dataset is missing required column(s): {missing}")


def _save_bar_chart(series, title, xlabel, ylabel, path):
    """Render *series* as a bar chart and save it to *path*."""
    if series.empty:
        # Nothing to draw; skip rather than let the plotting call fail on
        # empty data.
        return

    # Imported lazily so the analysis functions stay usable without
    # matplotlib when chart output is disabled (plot_path=None).
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        series.plot(kind='bar', ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.tight_layout()
        fig.savefig(path)
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)


def load_dataset(filename='Combined Data.csv'):
    """
    Load the mental health dataset from a CSV file.

    Args:
        filename (str): The name of the CSV file to load. A relative name is
            resolved against the current working directory.

    Returns:
        pandas.DataFrame: The loaded dataset.
    """
    return pd.read_csv(filename)


def perform_eda(data: pd.DataFrame,
                plot_path: Optional[str] = 'mental_health_distribution.png') -> dict:
    """
    Perform initial exploratory data analysis on the mental health dataset.

    Args:
        data (pandas.DataFrame): The dataset to analyze. Must contain a
            'status' column.
        plot_path (str | None): Where to save the bar chart of status counts.
            Pass None to skip chart generation.

    Returns:
        dict: EDA results with the keys
            'info' (str): the text report produced by ``DataFrame.info``,
            'describe' (pandas.DataFrame): ``data.describe()``,
            'status_counts' (pandas.Series): rows per status,
            'custom_stats' (pandas.Series): ``describe()`` of the status column.

    Raises:
        KeyError: If the 'status' column is missing.
    """
    _require_columns(data, ['status'])

    eda_results = {}

    # DataFrame.info() writes its report to a stream and returns None, so the
    # text has to be captured through ``buf`` to be stored in the results.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    info_text = info_buffer.getvalue()
    eda_results['info'] = info_text
    # Still echoed so the console output of existing callers is unchanged.
    print(info_text, end='')

    # Descriptive statistics
    eda_results['describe'] = data.describe()

    # Count of each mental health status
    status_counts = data['status'].value_counts()
    eda_results['status_counts'] = status_counts

    # Visualize the distribution of mental health statuses
    if plot_path is not None:
        _save_bar_chart(
            status_counts,
            title='Distribution of Mental Health Statuses',
            xlabel='Mental Health Status',
            ylabel='Count',
            path=plot_path,
        )

    # Summary (count / unique / top / freq) of the status column itself
    eda_results['custom_stats'] = data['status'].describe()

    return eda_results


def analyze_sentiment_length(data: pd.DataFrame,
                             plot_path: Optional[str] = 'statement_length_by_status.png') -> dict:
    """
    Analyze the relationship between statement length and mental health status.

    The input DataFrame is not modified.

    Args:
        data (pandas.DataFrame): The dataset to analyze. Must contain
            'statement' and 'status' columns.
        plot_path (str | None): Where to save the bar chart of mean statement
            lengths. Pass None to skip chart generation.

    Returns:
        dict: Analysis results with the key
            'mean_lengths' (pandas.Series): mean statement length (in
            characters) per status, sorted in descending order.

    Raises:
        KeyError: If the 'statement' or 'status' column is missing.
    """
    _require_columns(data, ['statement', 'status'])

    analysis_results = {}

    # Keep the lengths in a separate Series instead of adding a column to the
    # caller's DataFrame; the name matches the column previously created.
    statement_lengths = data['statement'].str.len().rename('statement_length')

    # Group by mental health status and calculate mean statement length
    mean_lengths = (
        statement_lengths.groupby(data['status'])
        .mean()
        .sort_values(ascending=False)
    )
    analysis_results['mean_lengths'] = mean_lengths

    # Visualize mean statement lengths
    if plot_path is not None:
        _save_bar_chart(
            mean_lengths,
            title='Average Statement Length by Mental Health Status',
            xlabel='Mental Health Status',
            ylabel='Average Statement Length',
            path=plot_path,
        )

    return analysis_results


if __name__ == "__main__":
    # Results are bound at module level on purpose: the messages below point
    # the user at these names (e.g. when run with ``python -i``).

    # Load the dataset
    data = load_dataset()

    # Perform EDA
    eda_results = perform_eda(data)

    # Analyze sentiment length
    sentiment_length_results = analyze_sentiment_length(data)

    print("EDA completed. Results saved in 'eda_results' dictionary.")
    print("Sentiment length analysis completed. Results saved in 'sentiment_length_results' dictionary.")
    print("Visualizations saved as 'mental_health_distribution.png' and 'statement_length_by_status.png'.")