diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 9be6358..247908a 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -10,23 +10,13 @@ }, { "cell_type": "code", - "execution_count": 4, "id": "12579bf734bb1a92", "metadata": { "ExecuteTime": { - "end_time": "2025-11-23T13:53:57.753560Z", - "start_time": "2025-11-23T13:53:56.325948Z" + "end_time": "2025-12-14T01:12:41.942156089Z", + "start_time": "2025-12-14T01:12:31.060344929Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded 13 test sentences\n" - ] - } - ], "source": [ "import spacy\n", "import pandas as pd\n", @@ -69,7 +59,17 @@ "]\n", "\n", "print(f\"Loaded {len(test_sentences)} test sentences\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 13 test sentences\n" + ] + } + ], + "execution_count": 1 }, { "cell_type": "markdown", @@ -79,17 +79,101 @@ }, { "cell_type": "code", - "execution_count": 5, "id": "e003ac06a58cfbb4", "metadata": { "ExecuteTime": { - "end_time": "2025-11-23T13:54:12.922343Z", - "start_time": "2025-11-23T13:54:12.896440Z" + "end_time": "2025-12-14T01:12:42.630828162Z", + "start_time": "2025-12-14T01:12:41.953456434Z" } }, + "source": [ + "def analyze_text_statistics(sentences):\n", + " stats = []\n", + " for i, sent in enumerate(sentences):\n", + " doc = nlp(sent)\n", + " words = [token.text for token in doc if not token.is_space]\n", + " alpha_words = [token.text for token in doc if token.is_alpha]\n", + " \n", + " stats.append({\n", + " 'sentence_id': i,\n", + " 'sentence': sent,\n", + " 'char_length': len(sent),\n", + " 'word_count': len(words),\n", + " 'alpha_word_count': len(alpha_words),\n", + " 'avg_word_length': np.mean([len(word) for word in alpha_words]) if alpha_words else 0,\n", + " 'has_punctuation': any(token.is_punct for token in doc),\n", + " 'is_empty': len(sent.strip()) == 0\n", + " })\n", + " \n", + " return pd.DataFrame(stats)\n", + "\n", + "stats_df = analyze_text_statistics(test_sentences)\n", + "display(stats_df)\n", + "\n", + "# Visualize basic statistics\n", + "fig, axes = plt.subplots(2, 2, figsize=(12, 8))\n", + "\n", + "# Character length distribution\n", + "axes[0,0].bar(range(len(stats_df)), stats_df['char_length'])\n", + "axes[0,0].set_title('Character Length by Sentence')\n", + "axes[0,0].set_xlabel('Sentence ID')\n", + "axes[0,0].set_ylabel('Characters')\n", + "\n", + "# Word count distribution\n", + "axes[0,1].bar(range(len(stats_df)), stats_df['word_count'])\n", + "axes[0,1].set_title('Word Count by Sentence')\n", + "axes[0,1].set_xlabel('Sentence ID')\n", + "axes[0,1].set_ylabel('Words')\n", + "\n", + "# Average word length\n", + "axes[1,0].bar(range(len(stats_df)), stats_df['avg_word_length'])\n", + "axes[1,0].set_title('Average Word Length')\n", + "axes[1,0].set_xlabel('Sentence ID')\n", + "axes[1,0].set_ylabel('Characters')\n", + "\n", + "# Sentence type breakdown\n", + "types = ['With Punctuation' if x else 'No Punctuation' for x in stats_df['has_punctuation']]\n", + "type_counts = pd.Series(types).value_counts()\n", + "axes[1,1].pie(type_counts.values, labels=type_counts.index, autopct='%1.1f%%')\n", + "axes[1,1].set_title('Punctuation Distribution')\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ], "outputs": [ { "data": { + "text/plain": [ + " sentence_id sentence char_length \\\n", + "0 0 The cat sat on the mat. 23 \n", + "1 1 The cat sat on the mat 22 \n", + "2 2 The cat sat on the mat. 28 \n", + "3 3 On the mat, the cat was sitting. 32 \n", + "4 4 The feline rested on the rug. 29 \n", + "5 5 The quick brown fox jumps over the lazy dog. 44 \n", + "6 6 Over the lazy dog jumps the quick brown fox. 44 \n", + "7 7 The dog ran in the park. 24 \n", + "8 8 I love programming. 19 \n", + "9 9 She enjoys reading books. 25 \n", + "10 10 Short. 6 \n", + "11 11 A B C D E F G 13 \n", + "12 12 0 \n", + "\n", + " word_count alpha_word_count avg_word_length has_punctuation is_empty \n", + "0 7 6 2.833333 True False \n", + "1 6 6 2.833333 False False \n", + "2 7 6 2.833333 True False \n", + "3 9 7 3.428571 True False \n", + "4 7 6 3.833333 True False \n", + "5 10 9 3.888889 True False \n", + "6 10 9 3.888889 True False \n", + "7 7 6 3.000000 True False \n", + "8 4 3 5.333333 True False \n", + "9 5 4 5.250000 True False \n", + "10 2 1 5.000000 True False \n", + "11 7 7 1.000000 False False \n", + "12 0 0 0.000000 False True " + ], "text/html": [ "
\n", "