{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": "Import Cell\n", "id": "dd72d1539056a64" }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-21T17:01:35.974978Z", "start_time": "2025-11-21T17:01:34.412508Z" } }, "cell_type": "code", "source": [ "import token\n", "import spacy\n", "from spacy import displacy\n", "from IPython.display import display, HTML\n", "\n", "nlp = spacy.load(\"en_core_web_md\") # Medium size model\n", "\n", "test_sentences = [\n", " \"The cat sat on the mat.\",\n", " \"On the mat, the cat was sitting.\",\n", " \"A completely different sentence about something else.\"\n", "]" ], "id": "12579bf734bb1a92", "outputs": [], "execution_count": 21 }, { "metadata": {}, "cell_type": "markdown", "source": "Keep punctuation for direct copy detection but remove for semantic/keyword based methods", "id": "1c26616777253f10" }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-20T19:03:29.658876Z", "start_time": "2025-11-20T19:03:27.809309Z" } }, "cell_type": "code", "source": [ "\n", "for sent in test_sentences:\n", " doc = nlp(sent)\n", " print(f\"Sentence: {sent}\")\n", " print(f\"Tokens: {[token.text for token in doc]}\")\n", " print(\"---\")\n" ], "id": "e003ac06a58cfbb4", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: The cat sat on the mat.\n", "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n", "---\n", "Sentence: On the mat, the cat was sitting.\n", "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n", "---\n", "Sentence: A completely different sentence about something else.\n", "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n", "---\n" ] } ], "execution_count": 17 }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-21T23:26:23.172796Z", "start_time": "2025-11-21T23:26:21.541542Z" } }, "cell_type": "code", "source": [ "\n", "class TextPreprocessor:\n", " def __init__(self):\n", " self.nlp = spacy.load(\"en_core_web_md\")\n", "\n", " @staticmethod\n", " def direct_detection(text):\n", " \"\"\"For direct copy detection\"\"\"\n", " #Keep punctuation\n", " return text.lower().strip()\n", "\n", " def semantic_analysis(self, text):\n", " \"\"\"Semantic Similarity\"\"\"\n", " doc = self.nlp(text)\n", " processed_tokens = []\n", " # Remove stopwords, punctuation\n", " for token in doc:\n", " if not token.is_punct and not token.is_space and token.is_alpha and not token.is_stop:\n", " processed_tokens.append(token.lemma_.lower())\n", " return \" \".join(processed_tokens)\n", "\n", " def syntactic_analysis(self, text):\n", " \"\"\"Syntactic Similarity\"\"\"\n", " doc = self.nlp(text)\n", " processed_tokens = []\n", "\n", " # Normalize content words\n", " for token in doc:\n", " if token.is_space:\n", " continue\n", " elif token.is_punct:\n", " processed_tokens.append(token.text) # Keep punctuation\n", " elif token.is_stop:\n", " processed_tokens.append(token.lemma_.lower()) # Normalize stopwords\n", " else:\n", " processed_tokens.append(token.lemma_.lower()) # Normalize content words\n", " return \" \".join(processed_tokens)\n", "\n", "\n", "preprocessor = TextPreprocessor()\n", "\n", "processed_direct = []\n", "processed_semantic = []\n", "processed_syntactic = []\n", "\n", "for sentence in test_sentences:\n", " print(\"-\" * 50)\n", " print(f\"Sentence: {sentence}\")\n", " direct = preprocessor.direct_detection(sentence)\n", " processed_direct.append(direct)\n", " print(\"--- Direct Sentence ---\")\n", " print(f\"{direct}\")\n", " semantic = preprocessor.semantic_analysis(sentence)\n", " processed_semantic.append(semantic)\n", " print(\"--- Semantic Sentence ---\")\n", " print(f\"{semantic}\")\n", " syntactic = preprocessor.syntactic_analysis(sentence)\n", " processed_syntactic.append(syntactic)\n", " print(\"--- Syntactic Sentence ---\")\n", " print(f\"{syntactic}\")\n", "\n", "print(\"-\" * 50)\n", "#for sent in test_sentences:\n", "# print(f\"Original Sentence: {sent}\")\n", "# print(\"--- Semantic Analysis ---\")\n", "# print(f\"Preprocessed Sentence: {preprocessor.semantic_analysis(sent)}\")\n", "# print(\"--- Syntactic Analysis ---\")\n", "# print(f\"Preprocessed Sentence: {preprocessor.syntactic_analysis(sent)}\")\n", "# print(\"-\" * 50)" ], "id": "5e488a878a5cfccb", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--------------------------------------------------\n", "Sentence: The cat sat on the mat.\n", "--- Direct Sentence ---\n", "the cat sat on the mat.\n", "--- Semantic Sentence ---\n", "cat sit mat\n", "--- Syntactic Sentence ---\n", "the cat sit on the mat .\n", "--------------------------------------------------\n", "Sentence: On the mat, the cat was sitting.\n", "--- Direct Sentence ---\n", "on the mat, the cat was sitting.\n", "--- Semantic Sentence ---\n", "mat cat sit\n", "--- Syntactic Sentence ---\n", "on the mat , the cat be sit .\n", "--------------------------------------------------\n", "Sentence: A completely different sentence about something else.\n", "--- Direct Sentence ---\n", "a completely different sentence about something else.\n", "--- Semantic Sentence ---\n", "completely different sentence\n", "--- Syntactic Sentence ---\n", "a completely different sentence about something else .\n", "--------------------------------------------------\n" ] } ], "execution_count": 37 }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-21T23:39:48.443022Z", "start_time": "2025-11-21T23:39:48.411766Z" } }, "cell_type": "code", "source": [ "\n", "def extract_parse_tree(text):\n", " doc = nlp(text)\n", "\n", " print(f\"Sentence: {text}\")\n", " print(\"\\nDependenct Parse Tree:\")\n", " print(\"-\" * 50)\n", "\n", " for token in doc:\n", " print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n", "\n", " return doc\n", "\n", "for sentence in processed_syntactic:\n", " doc = extract_parse_tree(sentence)\n", " print(\"\\n\" + \"=\"*60 + \"\\n\")" ], "id": "83fc18c9de2e354", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: the cat sit on the mat .\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "the det cat []\n", "cat nsubj sit ['the']\n", "sit ROOT sit ['cat', 'on', '.']\n", "on prep sit ['mat']\n", "the det mat []\n", "mat pobj on ['the']\n", ". punct sit []\n", "\n", "============================================================\n", "\n", "Sentence: on the mat , the cat be sit .\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "on prep sit ['mat']\n", "the det mat []\n", "mat pobj on ['the']\n", ", punct sit []\n", "the det cat []\n", "cat nsubj sit ['the']\n", "be aux sit []\n", "sit ROOT sit ['on', ',', 'cat', 'be', '.']\n", ". punct sit []\n", "\n", "============================================================\n", "\n", "Sentence: a completely different sentence about something else .\n", "\n", "Dependenct Parse Tree:\n", "--------------------------------------------------\n", "a det sentence []\n", "completely advmod different []\n", "different amod sentence ['completely']\n", "sentence ROOT sentence ['a', 'different', 'about', '.']\n", "about prep sentence ['something']\n", "something pobj about ['else']\n", "else advmod something []\n", ". punct sentence []\n", "\n", "============================================================\n", "\n" ] } ], "execution_count": 39 }, { "metadata": {}, "cell_type": "markdown", "source": "***USE NetworkX", "id": "5b5c8742d7c4c4c5" }, { "metadata": { "ExecuteTime": { "end_time": "2025-11-21T18:20:09.575176Z", "start_time": "2025-11-21T18:20:09.465504Z" } }, "cell_type": "code", "source": [ "\n", "\n", "def visualize_parse_tree(text):\n", " doc = nlp(text)\n", " html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n", " display(HTML(html))\n", "\n", "\n", "\n", "for sentence in test_sentences:\n", " print(f\"Sentence: {sentence}\")\n", " print(\"---\")\n", " processed_sentence = preprocessor.syntactic_analysis(sentence)\n", " print(f\"Processed Sentence: \" + processed_sentence)\n", " visualize_parse_tree(processed_sentence)\n", " visualize_parse_tree(sentence)" ], "id": "e413238c1af12f62", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sentence: The cat sat on the mat.\n", "---\n", "Processed Sentence: the cat sit on the mat .\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " sit\n", " VERB\n", "\n", "\n", "\n", " on\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat .\n", " NOUN\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " The\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " sat\n", " VERB\n", "\n", "\n", "\n", " on\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat.\n", " NOUN\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "name": "stdout", "output_type": "stream", "text": [ "Sentence: On the mat, the cat was sitting.\n", "---\n", "Processed Sentence: on the mat , the cat be sit .\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " on\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat ,\n", " NOUN\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " be\n", " AUX\n", "\n", "\n", "\n", " sit .\n", " VERB\n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " aux\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " On\n", " ADP\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " mat,\n", " NOUN\n", "\n", "\n", "\n", " the\n", " DET\n", "\n", "\n", "\n", " cat\n", " NOUN\n", "\n", "\n", "\n", " was\n", " AUX\n", "\n", "\n", "\n", " sitting.\n", " VERB\n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " nsubj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " aux\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "name": "stdout", "output_type": "stream", "text": [ "Sentence: A completely different sentence about something else.\n", "---\n", "Processed Sentence: a completely different sentence about something else .\n" ] }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " a\n", " DET\n", "\n", "\n", "\n", " completely\n", " ADV\n", "\n", "\n", "\n", " different\n", " ADJ\n", "\n", "\n", "\n", " sentence\n", " NOUN\n", "\n", "\n", "\n", " about\n", " ADP\n", "\n", "\n", "\n", " something\n", " PRON\n", "\n", "\n", "\n", " else .\n", " ADV\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " amod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } }, { "data": { "text/plain": [ "" ], "text/html": [ "\n", "\n", " A\n", " DET\n", "\n", "\n", "\n", " completely\n", " ADV\n", "\n", "\n", "\n", " different\n", " ADJ\n", "\n", "\n", "\n", " sentence\n", " NOUN\n", "\n", "\n", "\n", " about\n", " ADP\n", "\n", "\n", "\n", " something\n", " PRON\n", "\n", "\n", "\n", " else.\n", " ADV\n", "\n", "\n", "\n", " \n", " \n", " det\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " amod\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " prep\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " pobj\n", " \n", " \n", "\n", "\n", "\n", " \n", " \n", " advmod\n", " \n", " \n", "\n", "" ] }, "metadata": {}, "output_type": "display_data", "jetTransient": { "display_id": null } } ], "execution_count": 32 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "6aff51eb71eb2238" } ], "metadata": { "kernelspec": { "name": "python3", "language": "python", "display_name": "Python 3 (ipykernel)" } }, "nbformat": 4, "nbformat_minor": 5 }