diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 234dd5b..b4f026a 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -18,7 +18,19 @@ "start_time": "2025-11-23T13:53:56.325948Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'spacy'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtoken\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m displacy\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mIPython\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdisplay\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m display, HTML\n", + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'spacy'" + ] + } + ], "source": [ "import token\n", "import spacy\n", diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index 2454db0..f4a5c48 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -1,17 +1,102 @@ { "cells": [ { + "cell_type": "code", + "execution_count": 3, + "id": "d2aa2997", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "done\n" + ] + } + ], + 
"source": [ + "# pip install rapidfuzz scikit-learn numpy\n", + "\n", + "import numpy as np\n", + "from collections import Counter\n", + "import string\n", + "from rapidfuzz import fuzz, distance\n", + "\n", + "test_pairs = [\n", + " # Direct copies and near-copies\n", + " (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n", + " (\"The cat sat on the mat.\", \"The cat sat on the mat\"), # No punctuation\n", + " (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Extra spaces\n", + " \n", + " # Paraphrases with same meaning\n", + " (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n", + " (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n", + " (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"), # Partial synonym\n", + " \n", + " # Different sentences\n", + " (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n", + " (\"I love programming.\", \"She enjoys reading books.\"), # Completely different\n", + " (\"The weather is nice today.\", \"It's raining outside.\"), # Opposite meaning\n", + " \n", + " # Edge cases\n", + " (\"Short.\", \"Short.\"), # Very short\n", + " (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n", + " (\"\", \"\"), # Empty strings\n", + "]\n", + "print(\"done\")" + ] + }, + { + "cell_type": "markdown", + "id": "b06d10d0", + "metadata": {}, + "source": [ + "### Jaccard Similarity " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "e60d024e969254a", "metadata": { "ExecuteTime": { "end_time": "2025-11-19T10:01:11.039074Z", "start_time": "2025-11-19T10:01:09.613806Z" } }, - "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' 
vs 'The cat sat on the mat': 0.667\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n", + "--------------------------------------------------\n", + "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n", + "--------------------------------------------------\n", + "'I love programming.' vs 'She enjoys reading books.': 0.000\n", + "--------------------------------------------------\n", + "'The weather is nice today.' vs 'It's raining outside.': 0.000\n", + "--------------------------------------------------\n", + "'Short.' 
vs 'Short.': 1.000\n", + "--------------------------------------------------\n", + "'A B C D E F G' vs 'A B C D E F G': 1.000\n", + "--------------------------------------------------\n", + "'' vs '': 0.000\n", + "--------------------------------------------------\n" ] } ], "source": [ - "import import_ipynb\n", - "#from notebooks.01_data_exploration import *\n", - "\n", "def jaccard_similarity(sent1, sent2):\n", " # make lowercase and split into words\n", " words1 = set(sent1.lower().split())\n", @@ -20,7 +105,7 @@ " union = words1.union(words2)\n", " return float(len(intersection)) / len(union) if union else 0.0\n", "\n", - "test_pairs = [\n", + "small_test_pairs = [\n", " (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Copy\n", " (\"The cat sat on the mat.\", \"On the mat sat the cat.\"), # Same words rearranged\n", " (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n", @@ -28,28 +113,469 @@ "\n", "for sent1, sent2 in test_pairs:\n", " similarity = jaccard_similarity(sent1, sent2)\n", - " print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n" + " print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", + " print(\"-\"* 50)\n" ] }, + { + "cell_type": "markdown", + "id": "337a1072", + "metadata": {}, + "source": [ + "### --- Levenshtein Similarity ---\n", + " Character & Word" ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0b68fdcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.':\n", + " Char similarity: 1.000 --- Word similarity: 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat':\n", + " Char similarity: 0.957 --- Word similarity: 0.833\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' 
vs 'The cat sat on the mat.':\n", + " Char similarity: 0.821 --- Word similarity: 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n", + " Char similarity: 0.344 --- Word similarity: 0.143\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The feline rested on the rug.':\n", + " Char similarity: 0.517 --- Word similarity: 0.500\n", + "--------------------------------------------------\n", + "'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n", + " Char similarity: 0.577 --- Word similarity: 0.400\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The dog ran in the park.':\n", + " Char similarity: 0.625 --- Word similarity: 0.333\n", + "--------------------------------------------------\n", + "'I love programming.' vs 'She enjoys reading books.':\n", + " Char similarity: 0.200 --- Word similarity: 0.000\n", + "--------------------------------------------------\n", + "'The weather is nice today.' vs 'It's raining outside.':\n", + " Char similarity: 0.192 --- Word similarity: 0.000\n", + "--------------------------------------------------\n", + "'Short.' 
vs 'Short.':\n", + " Char similarity: 1.000 --- Word similarity: 1.000\n", + "--------------------------------------------------\n", + "'A B C D E F G' vs 'A B C D E F G':\n", + " Char similarity: 1.000 --- Word similarity: 1.000\n", + "--------------------------------------------------\n", + "'' vs '':\n", + " Char similarity: 1.000 --- Word similarity: 1.000\n", + "--------------------------------------------------\n" + ] + } ], - "id": "e60d024e969254a", + "source": [ + "def char_levenshtein_similarity(sent1, sent2):\n", + " \"\"\" Character based edit-distance similarity \"\"\"\n", + " if not sent1 and not sent2:\n", + " return 1.0\n", + " if not sent1 or not sent2:\n", + " return 0.0\n", + " \n", + " max_len = max(len(sent1), len(sent2))\n", + " edit_distance = distance.Levenshtein.distance(sent1, sent2)\n", + " return 1 - (edit_distance / max_len)\n", + "\n", + "def word_levenshtein_similarity(sent1, sent2):\n", + " \"\"\" Word based edit-distance similarity \"\"\"\n", + " words1 = sent1.lower().split()\n", + " words2 = sent2.lower().split()\n", + " \n", + " if not words1 and not words2:\n", + " return 1.0\n", + " if not words1 or not words2:\n", + " return 0.0\n", + " \n", + " max_len = max(len(words1), len(words2))\n", + " edit_distance = distance.Levenshtein.distance(words1, words2)\n", + " return 1 - (edit_distance / max_len)\n", + "\n", + "for sent1, sent2 in test_pairs:\n", + " char_similarity = char_levenshtein_similarity(sent1, sent2)\n", + " word_similarity = word_levenshtein_similarity(sent1, sent2)\n", + " print(f\"'{sent1}' vs '{sent2}':\") \n", + " print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n", + " print(\"-\"* 50)" + ] + }, + { + "cell_type": "markdown", + "id": "bae45c9a", + "metadata": {}, + "source": [ + "### --- Cosine Similarity ---" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "46a985b4", + "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", - "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n", - "'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n" + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n", + "--------------------------------------------------\n", + "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n", + "--------------------------------------------------\n", + "'I love programming.' vs 'She enjoys reading books.': 0.000\n", + "--------------------------------------------------\n", + "'The weather is nice today.' vs 'It's raining outside.': 0.000\n", + "--------------------------------------------------\n", + "'Short.' 
vs 'Short.': 1.000\n", + "--------------------------------------------------\n", + "'A B C D E F G' vs 'A B C D E F G': 1.000\n", + "--------------------------------------------------\n", + "'' vs '': 1.000\n", + "--------------------------------------------------\n" ] } ], - "execution_count": 9 + "source": [ + "def cosine_similarity_bow(sent1, sent2):\n", + " \"\"\" cosine similarity using bag-of-words \"\"\"\n", + " words1 = sent1.lower().translate(str.maketrans('', '', string.punctuation)).split()\n", + " words2 = sent2.lower().translate(str.maketrans('', '', string.punctuation)).split()\n", + " \n", + " if not words1 and not words2:\n", + " return 1.0\n", + " \n", + " vocabulary = set(words1 + words2)\n", + " if not vocabulary:\n", + " return 0.0\n", + " \n", + " # Create frequency vectors\n", + " freq1 = Counter(words1)\n", + " freq2 = Counter(words2)\n", + " \n", + " # Convert to vectors\n", + " vec1 = np.array([freq1[word] for word in vocabulary])\n", + " vec2 = np.array([freq2[word] for word in vocabulary])\n", + " \n", + " # Compute cosine similarity\n", + " dot_product = np.dot(vec1, vec2)\n", + " norm1 = np.linalg.norm(vec1)\n", + " norm2 = np.linalg.norm(vec2)\n", + " \n", + " if norm1 == 0 or norm2 == 0:\n", + " return 0.0\n", + " \n", + " return dot_product / (norm1 * norm2)\n", + "\n", + "for sent1, sent2 in test_pairs:\n", + " similarity = cosine_similarity_bow(sent1, sent2)\n", + " print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", + " print(\"-\"* 50)" + ] + }, + { + "cell_type": "markdown", + "id": "658276dc", + "metadata": {}, + "source": [ + "### --- Fuzzy ratios ---" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7dc7ac2e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fuzzy ratio Examples\n", + "==========================================================================================\n", + "Case                           Ratio    Partial  Token Sort   Token Set 
Description\n", + "==========================================================================================\n", + "The cat sat on the mat. 100 100 100 100 Exact copy\n", + "The cat sat on the mat. 97 100 97 97 Different content\n", + "The cat sat on the mat. 90 82 100 100 Same words, different order\n", + "The cat sat on the mat. 47 57 58 62 Different content\n", + "The cat sat on the mat. 61 60 53 60 Different content\n", + "The quick brown fox jumps. 61 70 57 57 Different content\n", + "The cat sat on the mat. 63 63 55 55 Different content\n", + "I love programming. 40 43 40 40 Different content\n", + "The weather is nice today. 38 47 29 29 Different content\n", + "Short. 100 100 100 100 Exact copy\n", + "A B C D E F G 100 100 100 100 Exact copy\n", + " 100 100 100 0 Exact copy\n" + ] + } + ], + "source": [ + "def fuzzy_ratio_similarity(sent1, sent2):\n", + " \"\"\"Fuzzy string matching ratio\"\"\"\n", + " return fuzz.ratio(sent1.lower(), sent2.lower()) / 100.0\n", + "\n", + "def fuzzy_partial_ratio(sent1, sent2):\n", + " \"\"\"Fuzzy partial string matching\"\"\"\n", + " return fuzz.partial_ratio(sent1.lower(), sent2.lower()) / 100.0\n", + "\n", + "def fuzzy_token_sort_ratio(sent1, sent2):\n", + " \"\"\"Fuzzy token sorting ratio (ignore order)\"\"\"\n", + " return fuzz.token_sort_ratio(sent1.lower(), sent2.lower()) / 100.0\n", + "\n", + "def fuzzy_token_set_ratio(sent1, sent2):\n", + " \"\"\" Fuzzy token set ratio (duplicates) \"\"\"\n", + " return fuzz.token_set_ratio(sent1.lower(), sent2.lower()) / 100.0\n", + "\n", + "print(\"Fuzzy ratio Examples\")\n", + "print(\"=\" * 90)\n", + "print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n", + "print(\"=\" * 90)\n", + "\n", + "for sent1, sent2 in test_pairs:\n", + " ratio = int(fuzz.ratio(sent1, sent2))\n", + " partial = int(fuzz.partial_ratio(sent1, sent2)) \n", + " token_sort = int(fuzz.token_sort_ratio(sent1, sent2))\n", + " token_set = 
int(fuzz.token_set_ratio(sent1, sent2))\n", + " \n", + " # description\n", + " if sent1 == sent2:\n", + " desc = \"Exact copy\"\n", + " elif sorted(sent1.split()) == sorted(sent2.split()):\n", + " desc = \"Same words, different order\"\n", + " elif set(sent1.split()).issubset(set(sent2.split())) or set(sent2.split()).issubset(set(sent1.split())):\n", + " desc = \"Subset relationship\"\n", + " else:\n", + " desc = \"Different content\"\n", + " \n", + " print(f\"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")" + ] + }, + { + "cell_type": "markdown", + "id": "a48774d4", + "metadata": {}, + "source": [ + "### --- Longest common sub-sequence ----" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e6a4d4e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n", + "--------------------------------------------------\n", + "'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n", + "--------------------------------------------------\n", + "'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n", + "--------------------------------------------------\n", + "'I love programming.' vs 'She enjoys reading books.': 0.333\n", + "--------------------------------------------------\n", + "'The weather is nice today.' 
vs 'It's raining outside.': 0.360\n", + "--------------------------------------------------\n", + "'Short.' vs 'Short.': 1.000\n", + "--------------------------------------------------\n", + "'A B C D E F G' vs 'A B C D E F G': 1.000\n", + "--------------------------------------------------\n", + "'' vs '': 0.000\n", + "--------------------------------------------------\n" + ] + } + ], + "source": [ + "def longest_common_subsequence(sent1, sent2):\n", + " \"\"\" Longest common subsequence similarity \"\"\"\n", + " if not sent1 or not sent2:\n", + " return 0.0\n", + " \n", + " # Remove punctuation for better matching\n", + " sent1_clean = sent1.lower().translate(str.maketrans('', '', string.punctuation))\n", + " sent2_clean = sent2.lower().translate(str.maketrans('', '', string.punctuation))\n", + " \n", + " m, n = len(sent1_clean), len(sent2_clean)\n", + " dp = [[0] * (n + 1) for _ in range(m + 1)]\n", + " \n", + " for i in range(1, m + 1):\n", + " for j in range(1, n + 1):\n", + " if sent1_clean[i-1] == sent2_clean[j-1]:\n", + " dp[i][j] = dp[i-1][j-1] + 1\n", + " else:\n", + " dp[i][j] = max(dp[i-1][j], dp[i][j-1])\n", + " \n", + " lcs_length = dp[m][n]\n", + " return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n", + "\n", + "for sent1, sent2 in test_pairs:\n", + " similarity = longest_common_subsequence(sent1, sent2)\n", + " print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n", + " print(\"-\"* 50)" + ] + }, + { + "cell_type": "markdown", + "id": "1a532335", + "metadata": {}, + "source": [ + "### --- Containment Similarity ---\n", + "Percentage of Sentence A in Sentence B
\n", + "containment(A, B) = |words(A) ∩ words(B)| / |words(A)|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "493979a4", + "metadata": {}, + "outputs": [], + "source": [ + "def containment_similarity(sent1, sent2):\n", + " # sent1 in sent2 (asymmetric)\n", + " \"\"\" What percentage of sent1's words are in sent2 \"\"\"\n", + " words1 = set(sent1.lower().translate(str.maketrans('', '', string.punctuation)).split())\n", + " words2 = set(sent2.lower().translate(str.maketrans('', '', string.punctuation)).split())\n", + " \n", + " if not words1:\n", + " return 0.0\n", + " \n", + " common = words1.intersection(words2)\n", + " return len(common) / len(words1)\n", + "\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "912a9c7c", + "metadata": {}, + "source": [ + "### Evaluate BaseLine Methods\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3d07562", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pair Sentence 1 Sentence 2 \n", + "====================================================================================================\n", + "Pair 1: The cat sat on the mat. The cat sat on the mat. 
\n" + ] + }, + { + "ename": "TypeError", + "evalue": "tuple indices must be integers or slices, not dict", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n", + "\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict" + ] + } + ], + "source": [ + "def evaluate_baseline_methods(pairs):\n", + " \"\"\" Evaluate all baseline methods on test pairs\"\"\"\n", + " methods = {\n", + " 'Jaccard': jaccard_similarity,\n", + " 'Levenshtein (char)': char_levenshtein_similarity,\n", + " 'Levenshtein (word)': word_levenshtein_similarity,\n", + " 'Cosine BOW': cosine_similarity_bow,\n", + " 'Fuzzy Ratio': fuzzy_ratio_similarity,\n", + " 'Fuzzy Partial': fuzzy_partial_ratio,\n", + " 'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n", + " 'Fuzzy Token Set': fuzzy_token_set_ratio,\n", + " 'LCS': longest_common_subsequence,\n", + " 'Containment': containment_similarity,\n", + " }\n", + " \n", + " results = {method: [] for method in methods}\n", + " \n", + " for sent1, sent2 in pairs:\n", + " for method_name, method_func in methods.items():\n", + " similarity = method_func(sent1, sent2)\n", + " results[method_name].append(similarity)\n", + " \n", + " return results, methods\n", + "\n", + "def print_comparison_table(results, pairs):\n", + " \"\"\" Print a formatted comparison table \"\"\"\n", + " print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n", + " print(\"=\" * 100)\n", + " \n", + " for i, (sent1, sent2) in enumerate(pairs):\n", + " # Truncate long sentences for display\n", + " display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n", + " display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n", + " \n", + " print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n", + " \n", + " # Print similarities for this pair\n", + " for method_name in 
results:\n", + " similarity = results[method_name][i]\n", + " print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n", + " print(\"-\" * 100)\n", + "\n", + "results, methods = evaluate_baseline_methods(test_pairs)\n", + "print_comparison_table(results, test_pairs)" + ] } ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": ".venv", "language": "python", - "display_name": "Python 3 (ipykernel)" + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" } }, "nbformat": 4, diff --git a/requirments.txt b/requirments.txt index 4691ee1..0861ef8 100644 --- a/requirments.txt +++ b/requirments.txt @@ -1,9 +1,12 @@ -datasets -huggingface-hub pandas numpy scikit-learn -spacy matplotlib -seaborn -jupyter \ No newline at end of file + +spacy +rapidfuzz +datasets +jupyter + +huggingface-hub +seaborn \ No newline at end of file diff --git a/spacy_models b/spacy_models new file mode 100644 index 0000000..490c884 --- /dev/null +++ b/spacy_models @@ -0,0 +1,2 @@ +python -m spacy download en_core_web_lg +python -m spacy download en_core_web_trf \ No newline at end of file