paraphrase_detector/notebooks/02_baseline_experiments.ipynb

{
 "cells": [
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-11-19T00:00:04.962487Z",
     "start_time": "2025-11-19T00:00:04.958995Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import import_ipynb\n",
    "from notebooks.01_data_exploration import *\n",
    "\n",
    "def jaccard_similarity(sent1, sent2):\n",
    "    # make lowercase and split into words\n",
    "    words1 = set(sent1.lower().split())\n",
    "    words2 = set(sent2.lower().split())\n",
    "    intersection = words1.intersection(words2)\n",
    "    union = words1.union(words2)\n",
    "    return float(len(intersection)) / len(union) if union else 0.0\n",
    "\n",
    "test_pairs = [\n",
    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),     # Copy\n",
    "    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),     # Same words rearranged\n",
    "    (\"The cat sat on the mat.\", \"The dog ran in the park\")      # Different\n",
    "]\n",
    "\n",
    "for sent1, sent2 in test_pairs:\n",
    "    similarity = jaccard_similarity(sent1, sent2)\n",
    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
   ],
   "id": "e60d024e969254a",
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid decimal literal (2501033926.py, line 2)",
     "output_type": "error",
     "traceback": [
      "  \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m    \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n                     ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n"
     ]
    }
   ],
   "execution_count": 8
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3 (ipykernel)"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}