{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "Keep punctuation for direct copy detection but remove for semantic/keyword based methods", "id": "1c26616777253f10" }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-20T19:03:29.658876Z", "start_time": "2025-11-20T19:03:27.809309Z" } }, "cell_type": "code", "source": [ "import token\n", "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n", "\n", "test_sentences = [\n", " \"The cat sat on the mat.\",\n", " \"On the mat, the cat was sitting.\",\n", " \"A completely different sentence about something else.\"\n", "]\n", "\n", "for sent in test_sentences:\n", " doc = nlp(sent)\n", " print(f\"Sentence: {sent}\")\n", " print(f\"Tokens: {[token.text for token in doc]}\")\n", " print(\"---\")\n" ], "id": "e003ac06a58cfbb4", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: The cat sat on the mat.\n", "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n", "---\n", "Sentence: On the mat, the cat was sitting.\n", "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n", "---\n", "Sentence: A completely different sentence about something else.\n", "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n", "---\n" ] } ], "execution_count": 17 }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-20T19:53:11.868566Z", "start_time": "2025-11-20T19:53:11.861295Z" } }, "cell_type": "code", "source": [ "import spacy\n", "import token\n", "\n", "nlp = spacy.load(\"en_core_web_md\")\n", "\n", "test_sentences = [\n", " \"The cat sat on the mat.\",\n", " \"On the mat, the cat was sitting.\",\n", " \"A completely different sentence about something else.\"\n", "]\n", "\n", "class TextPreprocessor:\n", " def __init__(self):\n", " self.nlp = spacy.load(\"en_core_web_md\")\n", "\n", " def direct_detection(self, text):\n", " \"\"\"For direct copy detection\"\"\"\n", " #Keep punctuation\n", " return text.lower().strip()\n", "\n", " def semantic_analysis(self, text):\n", " \"\"\"Semantic Similarity\"\"\"\n", " doc = self.nlp(text)\n", " tokens = []\n", " for token in doc:\n", " if (not token.is_punct and not token.is_space and token.is_alpha and token.is_stop and len(token.lemma_) > 1): #Remove single char tokens\n", " tokens.append(token.lemma_.lower())\n", " return \" \".join(tokens)\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "for sent in test_sentences:\n", " print(f\"Original Sentence: {sent}\")\n", " print(\"---\")\n", " print(f\"Preprocessed Sentence: {preprocess_semantic(sent)}\")\n", " print(\"-\" * 50)" ], "id": "5e488a878a5cfccb", "outputs": [ { "ename": "IndentationError", "evalue": "expected an indented block after 'if' statement on line 26 (400725648.py, line 31)", "output_type": "error", "traceback": [ " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[19]\u001B[39m\u001B[32m, line 31\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfor sent in test_sentences:\u001B[39m\n ^\n\u001B[31mIndentationError\u001B[39m\u001B[31m:\u001B[39m expected an indented block after 'if' statement on line 26\n" ] } ], "execution_count": 19 }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-20T15:51:42.074798Z", "start_time": "2025-11-20T15:51:42.050593Z" } }, "cell_type": "code", "source": [ "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\")\n", "\n", "def extract_parse_tree(text):\n", " doc = nlp(text)\n", "\n", " print(f\"Sentence: {text}\")\n", " print(\"\\nDependenct Parse Tree:\")\n", " print(\"-\" * 50)\n", "\n", " for token in doc:\n", " print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n", "\n", " return doc\n", "\n", "test_sentences = [\n", " \"The cat sat on the mat.\",\n", " \"On the mat, the cat was sitting.\",\n", " \"A completely different sentence about something else.\"\n", "]\n", "\n", "for sentence in test_sentences:\n", " doc = extract_parse_tree(sentence)\n", " print(\"\\n\" + \"=\"*60 + \"\\n\")" ], "id": "83fc18c9de2e354", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: The cat sat on the mat.\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "The det cat []\n", "cat nsubj sat ['The']\n", "sat ROOT sat ['cat', 'on', '.']\n", "on prep sat ['mat']\n", "the det mat []\n", "mat pobj on ['the']\n", ". punct sat []\n", "\n", "============================================================\n", "\n", "Sentence: On the mat, the cat was sitting.\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "On prep sitting ['mat']\n", "the det mat []\n", "mat pobj On ['the']\n", ", punct sitting []\n", "the det cat []\n", "cat nsubj sitting ['the']\n", "was aux sitting []\n", "sitting ROOT sitting ['On', ',', 'cat', 'was', '.']\n", ". punct sitting []\n", "\n", "============================================================\n", "\n", "Sentence: A completely different sentence about something else.\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "A det sentence []\n", "completely advmod different []\n", "different amod sentence ['completely']\n", "sentence ROOT sentence ['A', 'different', 'about', '.']\n", "about prep sentence ['something']\n", "something pobj about ['else']\n", "else advmod something []\n", ". punct sentence []\n", "\n", "============================================================\n", "\n" ] } ], "execution_count": 15 }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-20T15:46:08.461059Z", "start_time": "2025-11-20T15:45:47.529073Z" } }, "cell_type": "code", "source": [ "import spacy\n", "from spacy import displacy\n", "from IPython.display import display, HTML\n", "\n", "nlp = spacy.load(\"en_core_web_md\")\n", "\n", "test_sentences = [\n", " \"The cat sat on the mat.\",\n", " \"On the mat, the cat was sitting.\",\n", " \"A completely different sentence about something else.\"\n", "]\n", "\n", "def visualize_parse_tree(text):\n", " doc = nlp(text)\n", " html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n", " display(HTML(html))\n", "\n", "for sentence in test_sentences:\n", " print(f\"Sentence: {sentence}\")\n", " visualize_parse_tree(sentence)" ], "id": "e413238c1af12f62", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: The cat sat on the mat.\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " The\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " sat\n", " VERB\n", "\n", "\n", "\n", " on\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat.\n", " NOUN\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "name": "stdout", "output_type": "stream", "text": [ "Sentence: On the mat, the cat was sitting.\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " On\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat,\n", " NOUN\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " was\n", " AUX\n", "\n", "\n", "\n", " sitting.\n", " VERB\n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " aux\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "name": "stdout", "output_type": "stream", "text": [ "Sentence: A completely different sentence about something else.\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " A\n", " DET\n", "\n", "\n", "\n", " completely\n", " ADV\n", "\n", "\n", "\n", " different\n", " ADJ\n", "\n", "\n", "\n", " sentence\n", " NOUN\n", "\n", "\n", "\n", " about\n", " ADP\n", "\n", "\n", "\n", " something\n", " PRON\n", "\n", "\n", "\n", " else.\n", " ADV\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " amod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } } ], "execution_count": 14 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "6aff51eb71eb2238" } ], "metadata": { "kernelspec": { "name": "python3", "language": "python", "display_name": "Python 3 (ipykernel)" } }, "nbformat": 4, "nbformat_minor": 5 }