Testing parsing and Jaccard similarity in notebooks

2025-11-18 23:25:04 +00:00
parent 42deee19f6
commit 8d6b1cab2c
3 changed files with 142 additions and 0 deletions
--- a/notebooks/02_baseline_experiments.ipynb
+++ b/notebooks/02_baseline_experiments.ipynb
@@ -0,0 +1,54 @@
+{
+ "cells": [
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-11-18T23:15:35.056834Z",
+     "start_time": "2025-11-18T23:15:35.051218Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "import string\n",
+    "\n",
+    "\n",
+    "def _tokenize(sentence: str) -> set:\n",
+    "    \"\"\"Return the set of lowercase words in `sentence`, ignoring edge punctuation.\"\"\"\n",
+    "    # Strip punctuation from both ends of every whitespace-separated token so that\n",
+    "    # 'mat.' and 'mat' count as the same word. Characters inside a token (the\n",
+    "    # apostrophe in `don't`, the hyphen in `well-known`) are left alone.\n",
+    "    stripped = (token.strip(string.punctuation) for token in sentence.lower().split())\n",
+    "    # A token made only of punctuation (e.g. '-') strips down to '' and is dropped.\n",
+    "    return {token for token in stripped if token}\n",
+    "\n",
+    "\n",
+    "def jaccard_similarity(sent1: str, sent2: str) -> float:\n",
+    "    \"\"\"Return the Jaccard similarity of the word sets of two sentences.\n",
+    "\n",
+    "    The score is the number of shared words divided by the number of distinct\n",
+    "    words across both sentences. Words are compared case-insensitively and with\n",
+    "    leading/trailing ASCII punctuation removed, so word order, capitalisation and\n",
+    "    sentence-final punctuation do not affect the result.\n",
+    "\n",
+    "    Args:\n",
+    "        sent1: First sentence.\n",
+    "        sent2: Second sentence.\n",
+    "\n",
+    "    Returns:\n",
+    "        A float in [0.0, 1.0]; 0.0 when neither sentence contains any word.\n",
+    "    \"\"\"\n",
+    "    words1 = _tokenize(sent1)\n",
+    "    words2 = _tokenize(sent2)\n",
+    "    union = words1 | words2\n",
+    "    if not union:\n",
+    "        # Both sentences are empty (or punctuation only): avoid dividing by zero.\n",
+    "        return 0.0\n",
+    "    return len(words1 & words2) / len(union)\n",
+    "\n",
+    "\n",
+    "test_pairs = [\n",
+    "    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),     # Copy\n",
+    "    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),     # Same words rearranged\n",
+    "    (\"The cat sat on the mat.\", \"The dog ran in the park\")      # Different\n",
+    "]\n",
+    "\n",
+    "for sent1, sent2 in test_pairs:\n",
+    "    similarity = jaccard_similarity(sent1, sent2)\n",
+    "    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
+    "\n",
+    "# Word order and sentence-final punctuation must not lower the score.\n",
+    "assert jaccard_similarity(*test_pairs[1]) == 1.0\n"
+   ],
+   "id": "e60d024e969254a",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
+      "'The cat sat on the mat.' vs 'On the mat sat the cat.': 1.000\n",
+      "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
+     ]
+    }
+   ],
+   "execution_count": 7
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "language": "python",
+   "display_name": "Python 3 (ipykernel)"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}