# Notebook setup cell: load the spaCy pipeline and define the shared test corpus
# used by every analysis cell below.
import token
import spacy
from spacy import displacy
from IPython.display import display, HTML

# Medium-size English pipeline — includes word vectors, which the semantic
# similarity experiments below rely on.
nlp = spacy.load("en_core_web_md")

# Three probes: a base sentence, a paraphrase of it, and an unrelated control.
test_sentences = [
    "The cat sat on the mat.",
    "On the mat, the cat was sitting.",
    "A completely different sentence about something else.",
]
"2025-11-21T16:32:42.601290Z" } }, "cell_type": "code", "source": [ - "import spacy\n", - "import token\n", - "\n", - "nlp = spacy.load(\"en_core_web_md\")\n", - "\n", - "test_sentences = [\n", - " \"The cat sat on the mat.\",\n", - " \"On the mat, the cat was sitting.\",\n", - " \"A completely different sentence about something else.\"\n", - "]\n", "\n", "class TextPreprocessor:\n", " def __init__(self):\n", " self.nlp = spacy.load(\"en_core_web_md\")\n", "\n", - " def direct_detection(self, text):\n", - " \"\"\"For direct copy detection\"\"\"\n", - " #Keep punctuation\n", - " return text.lower().strip()\n", - "\n", - " def semantic_analysis(self, text):\n", - " \"\"\"Semantic Similarity\"\"\"\n", - " doc = self.nlp(text)\n", - " tokens = []\n", - " for token in doc:\n", - " if (not token.is_punct and not token.is_space and token.is_alpha and token.is_stop and len(token.lemma_) > 1): #Remove single char tokens\n", - " tokens.append(token.lemma_.lower())\n", - " return \" \".join(tokens)\n", + " @staticmethod\n", + " def direct_detection(text):\n", + " \"\"\"For direct copy detection\"\"\"\n", + " #Keep punctuation\n", + " return text.lower().strip()\n", "\n", + " def semantic_analysis(self, text):\n", + " \"\"\"Semantic Similarity\"\"\"\n", + " doc = self.nlp(text)\n", + " processed_tokens = []\n", + " # Remove stopwords, punctuation\n", + " for token in doc:\n", + " if not token.is_punct and not token.is_space and token.is_alpha and not token.is_stop:\n", + " processed_tokens.append(token.lemma_.lower())\n", + " return \" \".join(processed_tokens)\n", + "\n", + " def syntactic_analysis(self, text):\n", + " \"\"\"Syntactic Similarity\"\"\"\n", + " doc = self.nlp(text)\n", + " processed_tokens = []\n", + "\n", + " # Normalize content words\n", + " for token in doc:\n", + " if token.is_space:\n", + " continue\n", + " elif token.is_punct:\n", + " processed_tokens.append(token.text) # Keep punctuation\n", + " elif token.is_stop:\n", + " 
processed_tokens.append(token.lemma_.lower()) # Normalize stopwords\n", + " else:\n", + " processed_tokens.append(token.lemma_.lower()) # Normalize content words\n", + " return \" \".join(processed_tokens)\n", "\n", "\n", + "preprocessor = TextPreprocessor()\n", "\n", + "processed_direct = []\n", + "processed_semantic = []\n", + "processed_syntactic = []\n", "\n", + "for sentence in test_sentences:\n", + " processed_direct[iter] = preprocessor.semantic_analysis(sentence)\n", + "#print(preprocessor.syntactic_analysis(\"A completely different sentence about something else.\"))\n", "\n", "\n", "for sent in test_sentences:\n", " print(f\"Original Sentence: {sent}\")\n", - " print(\"---\")\n", - " print(f\"Preprocessed Sentence: {preprocess_semantic(sent)}\")\n", - " print(\"-\" * 50)" + " print(\"--- Semantic Analysis ---\")\n", + " print(f\"Preprocessed Sentence: {preprocessor.semantic_analysis(sent)}\")\n", + " print(\"--- Syntactic Analysis ---\")\n", + " print(f\"Preprocessed Sentence: {preprocessor.syntactic_analysis(sent)}\")\n", + " print(\"-\" * 50)" ], "id": "5e488a878a5cfccb", "outputs": [ { - "ename": "IndentationError", - "evalue": "expected an indented block after 'if' statement on line 26 (400725648.py, line 31)", - "output_type": "error", - "traceback": [ - " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[19]\u001B[39m\u001B[32m, line 31\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfor sent in test_sentences:\u001B[39m\n ^\n\u001B[31mIndentationError\u001B[39m\u001B[31m:\u001B[39m expected an indented block after 'if' statement on line 26\n" + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Sentence: The cat sat on the mat.\n", + "--- Semantic Analysis ---\n", + "Preprocessed Sentence: cat sit mat\n", + "--- Syntactic Analysis ---\n", + "Preprocessed Sentence: the cat sit on the mat .\n", + "--------------------------------------------------\n", + "Original Sentence: On the mat, the cat was sitting.\n", + "--- Semantic 
Analysis ---\n", + "Preprocessed Sentence: mat cat sit\n", + "--- Syntactic Analysis ---\n", + "Preprocessed Sentence: on the mat , the cat be sit .\n", + "--------------------------------------------------\n", + "Original Sentence: A completely different sentence about something else.\n", + "--- Semantic Analysis ---\n", + "Preprocessed Sentence: completely different sentence\n", + "--- Syntactic Analysis ---\n", + "Preprocessed Sentence: a completely different sentence about something else .\n", + "--------------------------------------------------\n" ] } ], - "execution_count": 19 + "execution_count": 17 }, { "metadata": { @@ -124,9 +179,6 @@ }, "cell_type": "code", "source": [ - "import spacy\n", - "\n", - "nlp = spacy.load(\"en_core_web_md\")\n", "\n", "def extract_parse_tree(text):\n", " doc = nlp(text)\n", @@ -140,12 +192,6 @@ "\n", " return doc\n", "\n", - "test_sentences = [\n", - " \"The cat sat on the mat.\",\n", - " \"On the mat, the cat was sitting.\",\n", - " \"A completely different sentence about something else.\"\n", - "]\n", - "\n", "for sentence in test_sentences:\n", " doc = extract_parse_tree(sentence)\n", " print(\"\\n\" + \"=\"*60 + \"\\n\")" @@ -206,34 +252,36 @@ ], "execution_count": 15 }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "***USE NetworkX", + "id": "5b5c8742d7c4c4c5" + }, { "metadata": { "ExecuteTime": { - "end_time": "2025-11-20T15:46:08.461059Z", - "start_time": "2025-11-20T15:45:47.529073Z" + "end_time": "2025-11-21T18:20:09.575176Z", + "start_time": "2025-11-21T18:20:09.465504Z" } }, "cell_type": "code", "source": [ - "import spacy\n", - "from spacy import displacy\n", - "from IPython.display import display, HTML\n", "\n", - "nlp = spacy.load(\"en_core_web_md\")\n", - "\n", - "test_sentences = [\n", - " \"The cat sat on the mat.\",\n", - " \"On the mat, the cat was sitting.\",\n", - " \"A completely different sentence about something else.\"\n", - "]\n", "\n", "def visualize_parse_tree(text):\n", " doc = 
nlp(text)\n", " html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n", " display(HTML(html))\n", "\n", + "\n", + "\n", "for sentence in test_sentences:\n", " print(f\"Sentence: {sentence}\")\n", + " print(\"---\")\n", + " processed_sentence = preprocessor.syntactic_analysis(sentence)\n", + " print(f\"Processed Sentence: \" + processed_sentence)\n", + " visualize_parse_tree(processed_sentence)\n", " visualize_parse_tree(sentence)" ], "id": "e413238c1af12f62", @@ -242,7 +290,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sentence: The cat sat on the mat.\n" + "Sentence: The cat sat on the mat.\n", + "---\n", + "Processed Sentence: the cat sit on the mat .\n" ] }, { @@ -251,7 +301,92 @@ "" ], "text/html": [ - "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " sit\n", + " VERB\n", + "\n", + "\n", + "\n", + " on\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat .\n", + " NOUN\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", "\n", " The\n", " DET\n", @@ -283,41 +418,41 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " nsubj\n", + " nsubj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", 
"\n", "\n", - " \n", + " \n", " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", @@ -334,7 +469,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sentence: On the mat, the cat was sitting.\n" + "Sentence: On the mat, the cat was sitting.\n", + "---\n", + "Processed Sentence: on the mat , the cat be sit .\n" ] }, { @@ -343,7 +480,105 @@ "" ], "text/html": [ - "\n", + "\n", + "\n", + " on\n", + " ADP\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " mat ,\n", + " NOUN\n", + "\n", + "\n", + "\n", + " the\n", + " DET\n", + "\n", + "\n", + "\n", + " cat\n", + " NOUN\n", + "\n", + "\n", + "\n", + " be\n", + " AUX\n", + "\n", + "\n", + "\n", + " sit .\n", + " VERB\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nsubj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " aux\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", "\n", " On\n", " ADP\n", @@ -380,49 +615,49 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " nsubj\n", + " nsubj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " aux\n", + " aux\n", " \n", " \n", "\n", @@ -439,7 +674,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sentence: A completely different 
sentence about something else.\n" + "Sentence: A completely different sentence about something else.\n", + "---\n", + "Processed Sentence: a completely different sentence about something else .\n" ] }, { @@ -448,7 +685,105 @@ "" ], "text/html": [ - "\n", + "\n", + "\n", + " a\n", + " DET\n", + "\n", + "\n", + "\n", + " completely\n", + " ADV\n", + "\n", + "\n", + "\n", + " different\n", + " ADJ\n", + "\n", + "\n", + "\n", + " sentence\n", + " NOUN\n", + "\n", + "\n", + "\n", + " about\n", + " ADP\n", + "\n", + "\n", + "\n", + " something\n", + " PRON\n", + "\n", + "\n", + "\n", + " else .\n", + " ADV\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " det\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " amod\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " prep\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pobj\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " advmod\n", + " \n", + " \n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data", + "jetTransient": { + "display_id": null + } + }, + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", "\n", " A\n", " DET\n", @@ -485,49 +820,49 @@ "\n", "\n", "\n", - " \n", + " \n", " \n", - " det\n", + " det\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " advmod\n", + " advmod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " amod\n", + " amod\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " prep\n", + " prep\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " pobj\n", + " pobj\n", " \n", " \n", "\n", "\n", "\n", - " \n", + " \n", " \n", - " advmod\n", + " advmod\n", " \n", " \n", "\n", @@ -541,7 +876,7 @@ } } ], - "execution_count": 14 + "execution_count": 32 }, { "metadata": {},