{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1c26616777253f10",
   "metadata": {},
   "source": [
    "## spaCy tokenization check\n",
    "\n",
    "Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods.\n",
    "\n",
    "The cell below prints the raw tokens for a few sample sentences; punctuation marks appear as separate tokens."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e003ac06a58cfbb4",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-18T23:01:17.888318Z",
     "start_time": "2025-11-18T23:01:16.494987Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: The cat sat on the mat.\n",
      "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
      "---\n",
      "Sentence: On the mat, the cat was sitting.\n",
      "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
      "---\n",
      "Sentence: A completely different sentence about something else.\n",
      "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
      "---\n"
     ]
    }
   ],
   "source": [
    "import spacy\n",
    "\n",
    "# Pipeline name kept in one place; can swap for the large model if required.\n",
    "SPACY_MODEL = \"en_core_web_md\"\n",
    "nlp = spacy.load(SPACY_MODEL)\n",
    "\n",
    "# Two rewordings of one sentence plus an unrelated one.\n",
    "test_sentences = [\n",
    "    \"The cat sat on the mat.\",\n",
    "    \"On the mat, the cat was sitting.\",\n",
    "    \"A completely different sentence about something else.\",\n",
    "]\n",
    "\n",
    "# Print the raw token texts; punctuation is not filtered here.\n",
    "for sent in test_sentences:\n",
    "    doc = nlp(sent)\n",
    "    print(f\"Sentence: {sent}\")\n",
    "    print(f\"Tokens: {[token.text for token in doc]}\")\n",
    "    print(\"---\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}