paraphrase_detector/notebooks/01_data_exploration.ipynb

{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods.",
   "id": "1c26616777253f10"
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-19T00:00:02.012627Z",
     "start_time": "2025-11-19T00:00:00.160731Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# NOTE(review): import_ipynb is not referenced anywhere in this notebook;\n",
    "# presumably kept for its import-time side effect -- confirm before removing.\n",
    "import import_ipynb\n",
    "import spacy\n",
    "\n",
    "# Medium English pipeline; can be swapped for the large model if required.\n",
    "nlp = spacy.load(\"en_core_web_md\")\n",
    "\n",
    "# Sample inputs: a sentence, a reworded version of it, and an unrelated sentence.\n",
    "test_sentences = [\n",
    "    \"The cat sat on the mat.\",\n",
    "    \"On the mat, the cat was sitting.\",\n",
    "    \"A completely different sentence about something else.\"\n",
    "]\n",
    "\n",
    "# Show how spaCy tokenizes each sentence; punctuation comes out as separate tokens.\n",
    "# nlp.pipe processes the texts as a batch and yields Docs in input order,\n",
    "# so zip() pairs every Doc with the sentence it was built from.\n",
    "for sent, doc in zip(test_sentences, nlp.pipe(test_sentences)):\n",
    "    print(f\"Sentence: {sent}\")\n",
    "    print(f\"Tokens: {[token.text for token in doc]}\")\n",
    "    print(\"---\")"
   ],
   "id": "e003ac06a58cfbb4",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sentence: The cat sat on the mat.\n",
      "Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
      "---\n",
      "Sentence: On the mat, the cat was sitting.\n",
      "Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
      "---\n",
      "Sentence: A completely different sentence about something else.\n",
      "Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
      "---\n"
     ]
    }
   ],
   "execution_count": 2
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "",
   "id": "83fc18c9de2e354"
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}