diff --git a/.gitignore b/.gitignore index 13e5995..6a4f5f5 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,7 @@ pyproject.toml *__pycache__* *egg-info* + + +.vscode +.scribbles \ No newline at end of file diff --git a/output/stats/NQDataHandler/NQDataHandler_answer_lengths.png b/output/stats/NQDataHandler/NQDataHandler_answer_lengths.png new file mode 100644 index 0000000..df2faa8 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_answer_lengths.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_document_lengths.png b/output/stats/NQDataHandler/NQDataHandler_document_lengths.png new file mode 100644 index 0000000..a78ba74 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_document_lengths.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_questions.png b/output/stats/NQDataHandler/NQDataHandler_num_questions.png new file mode 100644 index 0000000..e250895 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_questions.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens.png new file mode 100644 index 0000000..68eaccf Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers.png new file mode 100644 index 0000000..1e52ef8 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers_whitespace.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..0708eb2 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions.png new file mode 100644 index 0000000..74adb0d Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions_whitespace.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..834725a Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_tokens_whitespace.png b/output/stats/NQDataHandler/NQDataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..934cd5a Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens.png b/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens.png new file mode 100644 index 0000000..9c17656 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens_whitespace.png b/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..0a2da82 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_question_lengths.png b/output/stats/NQDataHandler/NQDataHandler_question_lengths.png new file mode 100644 index 0000000..ca91f32 Binary files /dev/null and b/output/stats/NQDataHandler/NQDataHandler_question_lengths.png differ diff --git a/output/stats/NQDataHandler/NQDataHandler_stats.json b/output/stats/NQDataHandler/NQDataHandler_stats.json new file mode 100644 index 0000000..0b4ee1c --- /dev/null +++ b/output/stats/NQDataHandler/NQDataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 88991.1, + "avg_question_length": 47.3, + "avg_answer_length": 41.5, + "avg_num_questions": 1.0, + "num_documents": 25010, + "avg_num_tokens": 20637.5, + "avg_num_tokens_questions": 9.0, + "avg_num_tokens_answers": 7.7, + "avg_num_tokens_by_whitespace": 6918.2, + "avg_num_tokens_questions_by_whitespace": 9.0, + "avg_num_tokens_answers_by_whitespace": 6.8, + "avg_num_unique_tokens": 2939.8, + "avg_num_unique_tokens_by_whitespace": 2864.8 +} \ No newline at end of file diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_answer_lengths.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_answer_lengths.png new file mode 100644 index 0000000..f2d4b27 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_answer_lengths.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_document_lengths.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_document_lengths.png new file mode 100644 index 0000000..916750b Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_document_lengths.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_questions.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_questions.png new file mode 100644 index 0000000..30f874b Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_questions.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens.png new file mode 100644 index 0000000..1b21c1f Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers.png new file mode 100644 index 0000000..ac49f83 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers_whitespace.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..7dd7e3a Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions.png new file mode 100644 index 0000000..052cdb2 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions_whitespace.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..0277030 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_whitespace.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..7ee2815 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens.png new file mode 100644 index 0000000..8baf4ee Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens_whitespace.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..eb2bf7a Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_question_lengths.png b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_question_lengths.png new file mode 100644 index 0000000..5d02ca1 Binary files /dev/null and b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_question_lengths.png differ diff --git a/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_stats.json b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_stats.json new file mode 100644 index 0000000..db2cf63 --- /dev/null +++ b/output/stats/NarrativeQADataHandler/NarrativeQADataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 331328.0, + "avg_question_length": 46.7, + "avg_answer_length": 12.6, + "avg_num_questions": 1.9, + "num_documents": 1073, + "avg_num_tokens": 68540.1, + "avg_num_tokens_questions": 9.6, + "avg_num_tokens_answers": 2.5, + "avg_num_tokens_by_whitespace": 51830.4, + "avg_num_tokens_questions_by_whitespace": 8.5, + "avg_num_tokens_answers_by_whitespace": 2.1, + "avg_num_unique_tokens": 5997.9, + "avg_num_unique_tokens_by_whitespace": 8983.5 +} \ No newline at end of file diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_answer_lengths.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_answer_lengths.png new file mode 100644 index 0000000..440d03b Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_answer_lengths.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_document_lengths.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_document_lengths.png new file mode 100644 index 0000000..037856f Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_document_lengths.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_questions.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_questions.png new file mode 100644 index 0000000..8cc0cc4 Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_questions.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens.png new file mode 100644 index 0000000..7110d4b Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers.png new file mode 100644 index 0000000..cff31e6 Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers_whitespace.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..2258d83 Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions.png new file mode 100644 index 0000000..69021bf Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions_whitespace.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..ac3f85f Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_whitespace.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..fb0b94e Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens.png new file mode 100644 index 0000000..97b1bfe Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens_whitespace.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..c5efa27 Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_question_lengths.png b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_question_lengths.png new file mode 100644 index 0000000..bb591f8 Binary files /dev/null and b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_question_lengths.png differ diff --git a/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_stats.json b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_stats.json new file mode 100644 index 0000000..1b3fead --- /dev/null +++ b/output/stats/StitchedCovidQADataHandler/StitchedCovidQADataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 65415.9, + "avg_question_length": 55.6, + "avg_answer_length": 79.0, + "avg_num_questions": 2.1, + "num_documents": 55, + "avg_num_tokens": 12154.7, + "avg_num_tokens_questions": 10.2, + "avg_num_tokens_answers": 13.5, + "avg_num_tokens_by_whitespace": 10009.2, + "avg_num_tokens_questions_by_whitespace": 8.8, + "avg_num_tokens_answers_by_whitespace": 11.1, + "avg_num_unique_tokens": 2382.0, + "avg_num_unique_tokens_by_whitespace": 2904.5 +} \ No newline at end of file diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_answer_lengths.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_answer_lengths.png new file mode 100644 index 0000000..bfb2aec Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_answer_lengths.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_document_lengths.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_document_lengths.png new file mode 100644 index 0000000..95ec2ab Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_document_lengths.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_questions.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_questions.png new file mode 100644 index 0000000..6aa28da Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_questions.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens.png new file mode 100644 index 0000000..9e29c0c Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers.png new file mode 100644 index 0000000..37b7dd5 Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers_whitespace.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..628abcb Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions.png new file mode 100644 index 0000000..20e67d0 Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions_whitespace.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..fcf4261 Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_whitespace.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..3ff4ce1 Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens.png new file mode 100644 index 0000000..562a92f Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens_whitespace.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..559c44e Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_question_lengths.png b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_question_lengths.png new file mode 100644 index 0000000..b363c53 Binary files /dev/null and b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_question_lengths.png differ diff --git a/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_stats.json b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_stats.json new file mode 100644 index 0000000..7c3334a --- /dev/null +++ b/output/stats/StitchedNewsQADataHandler/StitchedNewsQADataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 51709.0, + "avg_question_length": 36.0, + "avg_answer_length": 32.2, + "avg_num_questions": 11.9, + "num_documents": 685, + "avg_num_tokens": 9842.1, + "avg_num_tokens_questions": 7.4, + "avg_num_tokens_answers": 6.2, + "avg_num_tokens_by_whitespace": 8484.5, + "avg_num_tokens_questions_by_whitespace": 6.5, + "avg_num_tokens_answers_by_whitespace": 5.2, + "avg_num_unique_tokens": 2782.5, + "avg_num_unique_tokens_by_whitespace": 3265.3 +} \ No newline at end of file diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_answer_lengths.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_answer_lengths.png new file mode 100644 index 0000000..b69eccc Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_answer_lengths.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_document_lengths.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_document_lengths.png new file mode 100644 index 0000000..8529cff Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_document_lengths.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_questions.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_questions.png new file mode 100644 index 0000000..3852e71 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_questions.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens.png new file mode 100644 index 0000000..ea272d0 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers.png new file mode 100644 index 0000000..12399c8 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers_whitespace.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..aa3b7cf Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions.png new file mode 100644 index 0000000..bc94589 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions_whitespace.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..30f8a6e Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_whitespace.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..358e380 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens.png new file mode 100644 index 0000000..228d89f Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens_whitespace.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..8cd0f6e Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_question_lengths.png b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_question_lengths.png new file mode 100644 index 0000000..14c5dd7 Binary files /dev/null and b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_question_lengths.png differ diff --git a/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_stats.json b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_stats.json new file mode 100644 index 0000000..0aaa950 --- /dev/null +++ b/output/stats/StitchedSquadDataHandler/StitchedSquadDataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 50523.6, + "avg_question_length": 58.2, + "avg_answer_length": 24.9, + "avg_num_questions": 43.7, + "num_documents": 306, + "avg_num_tokens": 9280.9, + "avg_num_tokens_questions": 11.0, + "avg_num_tokens_answers": 4.3, + "avg_num_tokens_by_whitespace": 7998.3, + "avg_num_tokens_questions_by_whitespace": 9.9, + "avg_num_tokens_answers_by_whitespace": 3.9, + "avg_num_unique_tokens": 2532.6, + "avg_num_unique_tokens_by_whitespace": 2949.3 +} \ No newline at end of file diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_answer_lengths.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_answer_lengths.png new file mode 100644 index 0000000..a48fef5 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_answer_lengths.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_document_lengths.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_document_lengths.png new file mode 100644 index 0000000..95d79b4 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_document_lengths.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_questions.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_questions.png new file mode 100644 index 0000000..7a04e11 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_questions.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens.png new file mode 100644 index 0000000..905a7f0 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers.png new file mode 100644 index 0000000..30200eb Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers_whitespace.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers_whitespace.png new file mode 100644 index 0000000..d48a893 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_answers_whitespace.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions.png new file mode 100644 index 0000000..79e27f4 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions_whitespace.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions_whitespace.png new file mode 100644 index 0000000..f649fcf Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_questions_whitespace.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_whitespace.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_whitespace.png new file mode 100644 index 0000000..5e5320b Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_tokens_whitespace.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens.png new file mode 100644 index 0000000..da05c2d Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens_whitespace.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens_whitespace.png new file mode 100644 index 0000000..3ad5ad0 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_num_unique_tokens_whitespace.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_question_lengths.png b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_question_lengths.png new file mode 100644 index 0000000..811f965 Binary files /dev/null and b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_question_lengths.png differ diff --git a/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_stats.json b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_stats.json new file mode 100644 index 0000000..dd8bee2 --- /dev/null +++ b/output/stats/StitchedTechQADataHandler/StitchedTechQADataHandler_stats.json @@ -0,0 +1,15 @@ +{ + "avg_doc_length": 60968.4, + "avg_question_length": 348.1, + "avg_answer_length": 354.7, + "avg_num_questions": 10.6, + "num_documents": 45, + "avg_num_tokens": 13409.1, + "avg_num_tokens_questions": 71.0, + "avg_num_tokens_answers": 76.9, + "avg_num_tokens_by_whitespace": 7597.2, + "avg_num_tokens_questions_by_whitespace": 51.1, + "avg_num_tokens_answers_by_whitespace": 46.9, + "avg_num_unique_tokens": 1834.8, + "avg_num_unique_tokens_by_whitespace": 2130.1 +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 03887ee..89aa7b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ nvitop~=1.4.1 numpy~=2.2.2 pytest~=8.3.4 lxml~=5.3.1 +matplotlib~=3.5.1 diff --git a/src/stats/quant_dataset_stats.py b/src/stats/quant_dataset_stats.py new file mode 100644 index 0000000..174f074 --- /dev/null +++ b/src/stats/quant_dataset_stats.py @@ -0,0 +1,338 @@ +import json +import re +import sys +from typing import Dict, List + +import matplotlib.pyplot as plt + + +def read_dataset_json(filename: str) -> List[Dict]: + with open (filename, "r") as f: + dataset = json.load(f) + + return dataset + + +def get_avg_document_length(dataset: List[Dict]) -> float: + total_length = 0 + for sample in dataset: + total_length += len(sample["document"]) + return total_length / len(dataset) + + +def get_avg_question_length(dataset: List[Dict]) -> float: + total_length = 0 + num_questions = 0 + for sample in dataset: + for question in sample["questions"]: + total_length += len(question) + num_questions += 1 + return total_length / num_questions + + +def get_avg_answer_length(dataset: List[Dict]) -> float: + total_length = 0 + num_answers = 0 + for sample in dataset: + for answer_data in sample["answers"]: + answer = answer_data["answer"] + total_length += len(answer) + num_answers += 1 + return total_length / num_answers + + +def get_avg_num_of_questions(dataset: List[Dict]) -> float: + total_questions = 0 + for sample in dataset: + total_questions += len(sample["questions"]) + return total_questions / len(dataset) + + +def get_num_documents(dataset: List[Dict]) -> int: + return len(dataset) + + +def tokenize(text): + token_pattern = r""" + \b(?:[A-Z]\.)+[A-Z]? # Abbreviations like U.S.A. or U.K. + | \b\w+(?:[-']\w+)*\b # Words, including contractions and hyphenated words + | \.\.\. # Ellipses (...) + | [.,!?;(){}\[\]:\"“”] # Punctuation + | (?:https?://|www\.)\S+ # URLs + | [#@]\w+ # Hashtags and mentions + | \b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b # Numbers with commas or decimals + | [\U0001F600-\U0001F64F] # Emojis (basic range) + | \S # Any other non-whitespace character + """ + + return re.findall(token_pattern, text, re.VERBOSE) + +def get_avg_num_tokens(dataset: List[Dict]) -> float: + total_tokens = 0 + for sample in dataset: + total_tokens += len(tokenize(sample["document"])) + return total_tokens / len(dataset) + +def get_avg_num_tokens_questions(dataset: List[Dict]) -> float: + total_tokens = 0 + num_questions = 0 + for sample in dataset: + for question in sample["questions"]: + total_tokens += len(tokenize(question)) + num_questions += 1 + return total_tokens / num_questions + +def get_avg_num_tokens_answers(dataset: List[Dict]) -> float: + total_tokens = 0 + num_answers = 0 + for sample in dataset: + for answer_data in sample["answers"]: + answer = answer_data["answer"] + total_tokens += len(tokenize(answer)) + num_answers += 1 + return total_tokens / num_answers + + +def tokenize_by_whitespace(text): + return text.split() + + +def get_avg_num_tokens_by_whitespace(dataset: List[Dict]) -> float: + total_tokens = 0 + for sample in dataset: + total_tokens += len(tokenize_by_whitespace(sample["document"])) + return total_tokens / len(dataset) + + +def get_avg_num_tokens_questions_by_whitespace(dataset: List[Dict]) -> float: + total_tokens = 0 + num_questions = 0 + for sample in dataset: + for question in sample["questions"]: + total_tokens += len(tokenize_by_whitespace(question)) + num_questions += 1 + return total_tokens / num_questions + + +def get_avg_num_tokens_answers_by_whitespace(dataset: List[Dict]) -> float: + total_tokens = 0 + num_answers = 0 + for sample in dataset: + for answer_data in sample["answers"]: + answer = answer_data["answer"] + total_tokens += len(tokenize_by_whitespace(answer)) + num_answers += 1 + return total_tokens / num_answers + + +def get_avg_num_unique_tokens(dataset: List[Dict]) -> float: + total_tokens = 0 + for sample in dataset: + total_tokens += len(set(tokenize(sample["document"]))) + return total_tokens / len(dataset) + + +def get_avg_num_unique_tokens_by_whitespace(dataset: List[Dict]) -> float: + total_tokens = 0 + for sample in dataset: + total_tokens += len(set(tokenize_by_whitespace(sample["document"]))) + return total_tokens / len(dataset) + +# TODO: Write a function that stores all outcome values in a dictionary. +def get_all_stats(dataset: List[Dict]) -> Dict: + stats = { + "avg_doc_length": get_avg_document_length(dataset), + "avg_question_length": get_avg_question_length(dataset), + "avg_answer_length": get_avg_answer_length(dataset), + "avg_num_questions": get_avg_num_of_questions(dataset), + "num_documents": get_num_documents(dataset), + "avg_num_tokens": get_avg_num_tokens(dataset), + "avg_num_tokens_questions": get_avg_num_tokens_questions(dataset), + "avg_num_tokens_answers": get_avg_num_tokens_answers(dataset), + "avg_num_tokens_by_whitespace": get_avg_num_tokens_by_whitespace(dataset), + "avg_num_tokens_questions_by_whitespace": get_avg_num_tokens_questions_by_whitespace(dataset), + "avg_num_tokens_answers_by_whitespace": get_avg_num_tokens_answers_by_whitespace(dataset), + "avg_num_unique_tokens": get_avg_num_unique_tokens(dataset), + "avg_num_unique_tokens_by_whitespace": get_avg_num_unique_tokens_by_whitespace(dataset) + } + return stats + +def get_all_stats_rounded_to_1(dataset: List[Dict]) -> Dict: + stats = get_all_stats(dataset) + for key in stats: + stats[key] = round(stats[key], 1) + return stats + + +def write_stats(stats: Dict, filename: str) -> None: + outf = filename.replace(".json", "_stats.json") + with open (outf, "w") as f: + json.dump(stats, f, ensure_ascii=False, indent=4) + +def plot_distribution_of_document_lengths(filename: str, dataset: List[Dict]) -> None: + document_lengths = [len(sample["document"]) for sample in dataset] + plt.hist(document_lengths, bins=50) + plt.xlabel("Document Length") + plt.ylabel("Frequency") + plt.title("Distribution of Document Lengths") + plot_name = filename.replace(".json", "_document_lengths.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_question_lengths(filename: str, dataset: List[Dict]) -> None: + question_lengths = [len(question) for sample in dataset for question in sample["questions"]] + plt.hist(question_lengths, bins=50) + plt.xlabel("Question Length") + plt.ylabel("Frequency") + plt.title("Distribution of Question Lengths") + plot_name = filename.replace(".json", "_question_lengths.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_answer_lengths(filename: str, dataset: List[Dict]) -> None: + answer_lengths = [len(answer_data["answer"]) for sample in dataset for answer_data in sample["answers"]] + plt.hist(answer_lengths, bins=50) + plt.xlabel("Answer Length") + plt.ylabel("Frequency") + plt.title("Distribution of Answer Lengths") + plot_name = filename.replace(".json", "_answer_lengths.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_questions(filename: str, dataset: List[Dict]) -> None: + num_questions = [len(sample["questions"]) for sample in dataset] + plt.hist(num_questions, bins=50) + plt.xlabel("Number of Questions") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Questions") + plot_name = filename.replace(".json", "_num_questions.png") + plt.savefig(plot_name) + plt.clf() + + +def plot_distribution_of_num_tokens(filename: str, dataset: List[Dict]) -> None: + num_tokens = [len(tokenize(sample["document"])) for sample in dataset] + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens") + plot_name = filename.replace(".json", "_num_tokens.png") + plt.savefig(plot_name) + plt.clf() + + +def plot_distribution_of_num_tokens_by_whitespace(filename: str, dataset: List[Dict]) -> None: + num_tokens = [len(tokenize_by_whitespace(sample["document"])) for sample in dataset] + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens by Whitespace") + plot_name = filename.replace(".json", "_num_tokens_whitespace.png") + plt.savefig(plot_name) + plt.clf() + + +def plot_distribution_of_num_tokens_questions(filename: str, dataset: List[Dict]) -> None: + num_tokens = [len(tokenize(question)) for sample in dataset for question in sample["questions"]] + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens in Questions") + plot_name = filename.replace(".json", "_num_tokens_questions.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_tokens_questions_by_whitespace(filename: str, dataset: List[Dict]) -> None: + num_tokens = [] + for sample in dataset: + for question in sample["questions"]: + tokenized = tokenize_by_whitespace(question) + num_tokens.append(len(tokenized)) + + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens in Questions by Whitespace") + plot_name = filename.replace(".json", "_num_tokens_questions_whitespace.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_tokens_answers(filename: str, dataset: List[Dict]) -> None: + num_tokens = [] + for sample in dataset: + for answer_data in sample["answers"]: + tokenized = tokenize(answer_data["answer"]) + num_tokens.append(len(tokenized)) + + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens in Answers") + plot_name = filename.replace(".json", "_num_tokens_answers.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_tokens_answers_by_whitespace(filename: str, dataset: List[Dict]) -> None: + num_tokens = [] + for sample in dataset: + for answer_data in sample["answers"]: + tokenized = tokenize_by_whitespace(answer_data["answer"]) + num_tokens.append(len(tokenized)) + + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Tokens in Answers by Whitespace") + plot_name = filename.replace(".json", "_num_tokens_answers_whitespace.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_unique_tokens(filename: str, dataset: List[Dict]) -> None: + num_tokens = [] + for sample in dataset: + tokenized = tokenize(sample["document"]) + num_tokens.append(len(tokenized)) + + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Unique Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Unique Tokens") + plot_name = filename.replace(".json", "_num_unique_tokens.png") + plt.savefig(plot_name) + plt.clf() + +def plot_distribution_of_num_unique_tokens_by_whitespace(filename: str, dataset: List[Dict]) -> None: + num_tokens = [] + for sample in dataset: + tokenized = tokenize_by_whitespace(sample["document"]) + num_tokens.append(len(tokenized)) + + plt.hist(num_tokens, bins=50) + plt.xlabel("Number of Unique Tokens") + plt.ylabel("Frequency") + plt.title("Distribution of Number of Unique Tokens by Whitespace") + plot_name = filename.replace(".json", "_num_unique_tokens_whitespace.png") + plt.savefig(plot_name) + plt.clf() + +def plot_all_distributions(filename: str, dataset: List[Dict]) -> None: + plot_distribution_of_document_lengths(filename, dataset) + plot_distribution_of_question_lengths(filename, dataset) + plot_distribution_of_answer_lengths(filename, dataset) + plot_distribution_of_num_questions(filename, dataset) + plot_distribution_of_num_tokens(filename, dataset) + plot_distribution_of_num_tokens_by_whitespace(filename, dataset) + plot_distribution_of_num_tokens_questions(filename, dataset) + plot_distribution_of_num_tokens_questions_by_whitespace(filename, dataset) + plot_distribution_of_num_tokens_answers(filename, dataset) + plot_distribution_of_num_tokens_answers_by_whitespace(filename, dataset) + plot_distribution_of_num_unique_tokens(filename, dataset) + plot_distribution_of_num_unique_tokens_by_whitespace(filename, dataset) + + +if __name__ == "__main__": + filename = sys.argv[1] + dataset = read_dataset_json(filename) + stats = get_all_stats_rounded_to_1(dataset) + plot_all_distributions(filename, dataset) + write_stats(stats, filename)