From 3d755fca3ec9fe2becc33f7e13b6e91e987b42b0 Mon Sep 17 00:00:00 2001 From: Henry Dowd Date: Fri, 27 Feb 2026 04:56:34 +0000 Subject: [PATCH] Ran files --- notebooks/01_data_exploration.ipynb | 1440 +++++++++++------------ notebooks/02_baseline_experiments.ipynb | 53 +- notebooks/03_semantic_methods.ipynb | 2 +- notebooks/04_fusion_model.ipynb | 103 +- 4 files changed, 829 insertions(+), 769 deletions(-) diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 247908a..5eecf2f 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -10,6 +10,7 @@ }, { "cell_type": "code", + "execution_count": 2, "id": "12579bf734bb1a92", "metadata": { "ExecuteTime": { @@ -17,6 +18,15 @@ "start_time": "2025-12-14T01:12:31.060344929Z" } }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 13 test sentences\n" + ] + } + ], "source": [ "import spacy\n", "import pandas as pd\n", @@ -27,8 +37,8 @@ "import string\n", "from IPython.display import display, HTML\n", "\n", - "nlp_lg = spacy.load(\"en_core_web_lg\")\n", - "nlp_trf = spacy.load(\"en_core_web_trf\")\n", + "nlp_lg = spacy.load('en_core_web_lg')\n", + "nlp_trf = spacy.load('en_core_web_trf')\n", "\n", "nlp = nlp_lg\n", "\n", @@ -59,17 +69,7 @@ "]\n", "\n", "print(f\"Loaded {len(test_sentences)} test sentences\")" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded 13 test sentences\n" - ] - } - ], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -79,6 +79,7 @@ }, { "cell_type": "code", + "execution_count": 3, "id": "e003ac06a58cfbb4", "metadata": { "ExecuteTime": { @@ -86,94 +87,9 @@ "start_time": "2025-12-14T01:12:41.953456434Z" } }, - "source": [ - "def analyze_text_statistics(sentences):\n", - " stats = []\n", - " for i, sent in enumerate(sentences):\n", - " doc = nlp(sent)\n", - " words = [token.text for token in doc if not token.is_space]\n", - " alpha_words = [token.text for token in doc if token.is_alpha]\n", - " \n", - " stats.append({\n", - " 'sentence_id': i,\n", - " 'sentence': sent,\n", - " 'char_length': len(sent),\n", - " 'word_count': len(words),\n", - " 'alpha_word_count': len(alpha_words),\n", - " 'avg_word_length': np.mean([len(word) for word in alpha_words]) if alpha_words else 0,\n", - " 'has_punctuation': any(token.is_punct for token in doc),\n", - " 'is_empty': len(sent.strip()) == 0\n", - " })\n", - " \n", - " return pd.DataFrame(stats)\n", - "\n", - "stats_df = analyze_text_statistics(test_sentences)\n", - "display(stats_df)\n", - "\n", - "# Visualize basic statistics\n", - "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n", - "\n", - "# Character length distribution\n", - "axes[0,0].bar(range(len(stats_df)), stats_df['char_length'])\n", - "axes[0,0].set_title('Character Length by Sentence')\n", - "axes[0,0].set_xlabel('Sentence ID')\n", - "axes[0,0].set_ylabel('Characters')\n", - "\n", - "# Word count distribution\n", - "axes[0,1].bar(range(len(stats_df)), stats_df['word_count'])\n", - "axes[0,1].set_title('Word Count by Sentence')\n", - "axes[0,1].set_xlabel('Sentence ID')\n", - "axes[0,1].set_ylabel('Words')\n", - "\n", - "# Average word length\n", - "axes[1,0].bar(range(len(stats_df)), stats_df['avg_word_length'])\n", - "axes[1,0].set_title('Average Word Length')\n", - "axes[1,0].set_xlabel('Sentence ID')\n", - "axes[1,0].set_ylabel('Characters')\n", - "\n", - "# Sentence type breakdown\n", - "types = ['With Punctuation' if x else 'No Punctuation' for x in stats_df['has_punctuation']]\n", - "type_counts = pd.Series(types).value_counts()\n", - "axes[1,1].pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')\n", - "axes[1,1].set_title('Punctuation Distribution')\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ], "outputs": [ { "data": { - "text/plain": [ - " sentence_id sentence char_length \\\n", - "0 0 The cat sat on the mat. 23 \n", - "1 1 The cat sat on the mat 22 \n", - "2 2 The cat sat on the mat. 28 \n", - "3 3 On the mat, the cat was sitting. 32 \n", - "4 4 The feline rested on the rug. 29 \n", - "5 5 The quick brown fox jumps over the lazy dog. 44 \n", - "6 6 Over the lazy dog jumps the quick brown fox. 44 \n", - "7 7 The dog ran in the park. 24 \n", - "8 8 I love programming. 19 \n", - "9 9 She enjoys reading books. 25 \n", - "10 10 Short. 6 \n", - "11 11 A B C D E F G 13 \n", - "12 12 0 \n", - "\n", - " word_count alpha_word_count avg_word_length has_punctuation is_empty \n", - "0 7 6 2.833333 True False \n", - "1 6 6 2.833333 False False \n", - "2 7 6 2.833333 True False \n", - "3 9 7 3.428571 True False \n", - "4 7 6 3.833333 True False \n", - "5 10 9 3.888889 True False \n", - "6 10 9 3.888889 True False \n", - "7 7 6 3.000000 True False \n", - "8 4 3 5.333333 True False \n", - "9 5 4 5.250000 True False \n", - "10 2 1 5.000000 True False \n", - "11 7 7 1.000000 False False \n", - "12 0 0 0.000000 False True " - ], "text/html": [ "
\n", "