From 8d6b1cab2c168c305c7ef15deb91c3f1d59aca7e Mon Sep 17 00:00:00 2001 From: Henry Dowd Date: Tue, 18 Nov 2025 23:25:04 +0000 Subject: [PATCH] Testing parsing and jaccard similarity in notebooks --- notebooks/01_data_exploration.ipynb | 73 +++++++++++++++++++++++++ notebooks/02_baseline_experiments.ipynb | 54 ++++++++++++++++++ notebooks/03_semantic_methods.ipynb | 15 +++++ 3 files changed, 142 insertions(+) diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index e69de29..07cffdd 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -0,0 +1,73 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": "Keep punctuation for direct copy detection but remove for semantic/keyword based methods", + "id": "1c26616777253f10" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-18T23:01:17.888318Z", + "start_time": "2025-11-18T23:01:16.494987Z" + } + }, + "cell_type": "code", + "source": [ + "import spacy\n", + "\n", + "nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n", + "\n", + "test_sentences = [\n", + " \"The cat sat on the mat.\",\n", + " \"On the mat, the cat was sitting.\",\n", + " \"A completely different sentence about something else.\"\n", + "]\n", + "\n", + "for sent in test_sentences:\n", + " doc = nlp(sent)\n", + " print(f\"Sentence: {sent}\")\n", + " print(f\"Tokens: {[token.text for token in doc]}\")\n", + " print(\"---\")\n", + "\n" + ], + "id": "e003ac06a58cfbb4", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sentence: The cat sat on the mat.\n", + "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n", + "---\n", + "Sentence: On the mat, the cat was sitting.\n", + "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n", + "---\n", + "Sentence: A completely different sentence about something else.\n", + "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n", + "---\n" + ] + } + ], + "execution_count": 1 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "83fc18c9de2e354" + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index e69de29..34f5390 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-18T23:15:35.056834Z", + "start_time": "2025-11-18T23:15:35.051218Z" + } + }, + "cell_type": "code", + "source": [ + "def jaccard_similarity(sent1, sent2):\n", + " # make lowercase and split into words\n", + " words1 = set(sent1.lower().split())\n", + " words2 = set(sent2.lower().split())\n", + " intersection = words1.intersection(words2)\n", + " union = words1.union(words2)\n", + " return float(len(intersection)) / len(union) if union else 0.0\n", + "\n", + "test_pairs = [\n", + " (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Copy\n", + " (\"The cat sat on the mat.\", \"On the mat sat the cat.\"), # Same words rearranged\n", + " (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n", + "]\n", + "\n", + "for sent1, sent2 in test_pairs:\n", + " similarity = jaccard_similarity(sent1, sent2)\n", + " print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n" + ], + "id": "e60d024e969254a", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n", + "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n" + ] + } + ], + "execution_count": 7 + } + ], + "metadata": { + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3 (ipykernel)" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/03_semantic_methods.ipynb b/notebooks/03_semantic_methods.ipynb index e69de29..08f9bd0 100644 --- a/notebooks/03_semantic_methods.ipynb +++ b/notebooks/03_semantic_methods.ipynb @@ -0,0 +1,15 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "8a3c4314a90086fe" + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +}