Baseline experimentation on test data
This commit is contained in:
@@ -18,7 +18,19 @@
|
||||
"start_time": "2025-11-23T13:53:56.325948Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'spacy'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtoken\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mspacy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m displacy\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mIPython\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdisplay\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m display, HTML\n",
|
||||
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'spacy'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import token\n",
|
||||
"import spacy\n",
|
||||
|
||||
@@ -1,17 +1,102 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "d2aa2997",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"done\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# pip install rapidfuzz scikit-learn numpy\n",
|
||||
"\n",
|
||||
"import numpy as np\n",
|
||||
"from collections import Counter\n",
|
||||
"import string\n",
|
||||
"from rapidfuzz import fuzz, distance\n",
|
||||
"\n",
|
||||
"test_pairs = [\n",
|
||||
" # Direct copies and near-copies\n",
|
||||
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Exact copy\n",
|
||||
" (\"The cat sat on the mat.\", \"The cat sat on the mat\"), # No punctuation\n",
|
||||
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Extra spaces\n",
|
||||
" \n",
|
||||
" # Paraphrases with same meaning\n",
|
||||
" (\"The cat sat on the mat.\", \"On the mat, the cat was sitting.\"), # Structural change\n",
|
||||
" (\"The cat sat on the mat.\", \"The feline rested on the rug.\"), # Synonym replacement\n",
|
||||
" (\"The quick brown fox jumps.\", \"A fast brown fox leaps.\"), # Partial synonym\n",
|
||||
" \n",
|
||||
" # Different sentences\n",
|
||||
" (\"The cat sat on the mat.\", \"The dog ran in the park.\"), # Different content\n",
|
||||
" (\"I love programming.\", \"She enjoys reading books.\"), # Completely different\n",
|
||||
" (\"The weather is nice today.\", \"It's raining outside.\"), # Opposite meaning\n",
|
||||
" \n",
|
||||
" # Edge cases\n",
|
||||
" (\"Short.\", \"Short.\"), # Very short\n",
|
||||
" (\"A B C D E F G\", \"A B C D E F G\"), # Repeated words\n",
|
||||
" (\"\", \"\"), # Empty strings\n",
|
||||
"]\n",
|
||||
"print(\"done\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b06d10d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Jaccard Similarity "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "e60d024e969254a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2025-11-19T10:01:11.039074Z",
|
||||
"start_time": "2025-11-19T10:01:09.613806Z"
|
||||
}
|
||||
},
|
||||
"cell_type": "code",
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 0.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import import_ipynb\n",
|
||||
"#from notebooks.01_data_exploration import *\n",
|
||||
"\n",
|
||||
"def jaccard_similarity(sent1, sent2):\n",
|
||||
" # make lowercase and split into words\n",
|
||||
" words1 = set(sent1.lower().split())\n",
|
||||
@@ -20,7 +105,7 @@
|
||||
" union = words1.union(words2)\n",
|
||||
" return float(len(intersection)) / len(union) if union else 0.0\n",
|
||||
"\n",
|
||||
"test_pairs = [\n",
|
||||
"small_test_pairs = [\n",
|
||||
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Copy\n",
|
||||
" (\"The cat sat on the mat.\", \"On the mat sat the cat.\"), # Same words rearranged\n",
|
||||
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
|
||||
@@ -28,28 +113,469 @@
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = jaccard_similarity(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "337a1072",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### --- Levenshtein Similarity ---\n",
|
||||
" Character & Word"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "0b68fdcd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
|
||||
" Char similarity: 0.957 --- Word similarity: 0.833\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
||||
" Char similarity: 0.821 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
|
||||
" Char similarity: 0.344 --- Word similarity: 0.143\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
|
||||
" Char similarity: 0.517 --- Word similarity: 0.500\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
|
||||
" Char similarity: 0.577 --- Word similarity: 0.400\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
|
||||
" Char similarity: 0.625 --- Word similarity: 0.333\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.':\n",
|
||||
" Char similarity: 0.200 --- Word similarity: 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.':\n",
|
||||
" Char similarity: 0.192 --- Word similarity: 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"id": "e60d024e969254a",
|
||||
"source": [
|
||||
"def char_levenshtein_similarity(sent1, sent2):\n",
|
||||
" \"\"\" Character based edit-distance similarity \"\"\"\n",
|
||||
" if not sent1 and not sent2:\n",
|
||||
" return 1.0\n",
|
||||
" if not sent1 or not sent2:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" max_len = max(len(sent1), len(sent2))\n",
|
||||
" edit_distance = distance.Levenshtein.distance(sent1, sent2)\n",
|
||||
" return 1 - (edit_distance / max_len)\n",
|
||||
"\n",
|
||||
"def word_levenshtein_similarity(sent1, sent2):\n",
|
||||
" \"\"\" Word based edit-distance similarity \"\"\"\n",
|
||||
" words1 = sent1.lower().split()\n",
|
||||
" words2 = sent2.lower().split()\n",
|
||||
" \n",
|
||||
" if not words1 and not words2:\n",
|
||||
" return 1.0\n",
|
||||
" if not words1 or not words2:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" max_len = max(len(words1), len(words2))\n",
|
||||
" edit_distance = distance.Levenshtein.distance(words1, words2)\n",
|
||||
" return 1 - (edit_distance / max_len)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
|
||||
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}':\") \n",
|
||||
" print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n",
|
||||
" print(\"-\"* 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bae45c9a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### --- Cosine Similarity ---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "46a985b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 1.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"execution_count": 9
|
||||
"source": [
|
||||
"def cosine_similarity_bow(sent1, sent2):\n",
|
||||
"    \"\"\" Cosine similarity using bag-of-words \"\"\"\n",
|
||||
" words1 = sent1.lower().translate(str.maketrans('', '', string.punctuation)).split()\n",
|
||||
" words2 = sent2.lower().translate(str.maketrans('', '', string.punctuation)).split()\n",
|
||||
" \n",
|
||||
" if not words1 and not words2:\n",
|
||||
" return 1.0\n",
|
||||
" \n",
|
||||
" vocabulary = set(words1 + words2)\n",
|
||||
" if not vocabulary:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" # Create frequency vectors\n",
|
||||
" freq1 = Counter(words1)\n",
|
||||
" freq2 = Counter(words2)\n",
|
||||
" \n",
|
||||
" # Convert to vectors\n",
|
||||
" vec1 = np.array([freq1[word] for word in vocabulary])\n",
|
||||
" vec2 = np.array([freq2[word] for word in vocabulary])\n",
|
||||
" \n",
|
||||
" # Compute cosine similarity\n",
|
||||
" dot_product = np.dot(vec1, vec2)\n",
|
||||
" norm1 = np.linalg.norm(vec1)\n",
|
||||
" norm2 = np.linalg.norm(vec2)\n",
|
||||
" \n",
|
||||
" if norm1 == 0 or norm2 == 0:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" return dot_product / (norm1 * norm2)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = cosine_similarity_bow(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "658276dc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### --- Fuzzy ratios ---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "7dc7ac2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Fuzzy ratio Examples\n",
|
||||
"==========================================================================================\n",
|
||||
"Case Ratio Partial Token Sort Token Set Description\n",
|
||||
"==========================================================================================\n",
|
||||
"The cat sat on the mat. 100 100 100 100 Exact copy\n",
|
||||
"The cat sat on the mat. 97 100 97 97 Different content\n",
|
||||
"The cat sat on the mat. 90 82 100 100 Same words, different order\n",
|
||||
"The cat sat on the mat. 47 57 58 62 Different content\n",
|
||||
"The cat sat on the mat. 61 60 53 60 Different content\n",
|
||||
"The quick brown fox jumps. 61 70 57 57 Different content\n",
|
||||
"The cat sat on the mat. 63 63 55 55 Different content\n",
|
||||
"I love programming. 40 43 40 40 Different content\n",
|
||||
"The weather is nice today. 38 47 29 29 Different content\n",
|
||||
"Short. 100 100 100 100 Exact copy\n",
|
||||
"A B C D E F G 100 100 100 100 Exact copy\n",
|
||||
" 100 100 100 0 Exact copy\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def fuzzy_ratio_similarity(sent1, sent2):\n",
|
||||
" \"\"\"Fuzzy string matching ratio\"\"\"\n",
|
||||
" return fuzz.ratio(sent1.lower(), sent2.lower()) / 100.0\n",
|
||||
"\n",
|
||||
"def fuzzy_partial_ratio(sent1, sent2):\n",
|
||||
" \"\"\"Fuzzy partial string matching\"\"\"\n",
|
||||
" return fuzz.partial_ratio(sent1.lower(), sent2.lower()) / 100.0\n",
|
||||
"\n",
|
||||
"def fuzzy_token_sort_ratio(sent1, sent2):\n",
|
||||
" \"\"\"Fuzzy token sorting ratio (ignore order)\"\"\"\n",
|
||||
" return fuzz.token_sort_ratio(sent1.lower(), sent2.lower()) / 100.0\n",
|
||||
"\n",
|
||||
"def fuzzy_token_set_ratio(sent1, sent2):\n",
|
||||
" \"\"\" Fuzzy token set ratio (duplicates) \"\"\"\n",
|
||||
" return fuzz.token_set_ratio(sent1.lower(), sent2.lower()) / 100.0\n",
|
||||
"\n",
|
||||
"print(\"Fuzzy ratio Examples\")\n",
|
||||
"print(\"=\" * 90)\n",
|
||||
"print(f\"{'Case':<30} {'Ratio':<8} {'Partial':<8} {'Token Sort':<12} {'Token Set':<8} Description\")\n",
|
||||
"print(\"=\" * 90)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" ratio = int(fuzz.ratio(sent1, sent2))\n",
|
||||
" partial = int(fuzz.partial_ratio(sent1, sent2)) \n",
|
||||
" token_sort = int(fuzz.token_sort_ratio(sent1, sent2))\n",
|
||||
" token_set = int(fuzz.token_set_ratio(sent1, sent2))\n",
|
||||
" \n",
|
||||
" # description\n",
|
||||
" if sent1 == sent2:\n",
|
||||
" desc = \"Exact copy\"\n",
|
||||
" elif sorted(sent1.split()) == sorted(sent2.split()):\n",
|
||||
" desc = \"Same words, different order\"\n",
|
||||
" elif set(sent1.split()).issubset(set(sent2.split())) or set(sent2.split()).issubset(set(sent1.split())):\n",
|
||||
" desc = \"Subset relationship\"\n",
|
||||
" else:\n",
|
||||
" desc = \"Different content\"\n",
|
||||
" \n",
|
||||
" print(f\"{sent1[:28]:<30} {ratio:<8} {partial:<8} {token_sort:<12} {token_set:<8} {desc}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a48774d4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### --- Longest common sub-sequence ----"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "e6a4d4e2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 0.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def longest_common_subsequence(sent1, sent2):\n",
|
||||
" \"\"\" Longest common subsequence similarity \"\"\"\n",
|
||||
" if not sent1 or not sent2:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" # Remove punctuation for better matching\n",
|
||||
" sent1_clean = sent1.lower().translate(str.maketrans('', '', string.punctuation))\n",
|
||||
" sent2_clean = sent2.lower().translate(str.maketrans('', '', string.punctuation))\n",
|
||||
" \n",
|
||||
" m, n = len(sent1_clean), len(sent2_clean)\n",
|
||||
" dp = [[0] * (n + 1) for _ in range(m + 1)]\n",
|
||||
" \n",
|
||||
" for i in range(1, m + 1):\n",
|
||||
" for j in range(1, n + 1):\n",
|
||||
" if sent1_clean[i-1] == sent2_clean[j-1]:\n",
|
||||
" dp[i][j] = dp[i-1][j-1] + 1\n",
|
||||
" else:\n",
|
||||
" dp[i][j] = max(dp[i-1][j], dp[i][j-1])\n",
|
||||
" \n",
|
||||
" lcs_length = dp[m][n]\n",
|
||||
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = longest_common_subsequence(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1a532335",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### --- Containment Similarity ---\n",
|
||||
"Percentage of Sentence A in Sentence B <br>\n",
|
||||
"containment(A, B) = |words(A) ∩ words(B)| / |words(A)|"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "493979a4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def containment_similarity(sent1, sent2):\n",
|
||||
" # sent1 in sent2 (asymmetric)\n",
|
||||
" \"\"\" What percentage of sent1's words are in sent2 \"\"\"\n",
|
||||
" words1 = set(sent1.lower().translate(str.maketrans('', '', string.punctuation)).split())\n",
|
||||
" words2 = set(sent2.lower().translate(str.maketrans('', '', string.punctuation)).split())\n",
|
||||
" \n",
|
||||
" if not words1:\n",
|
||||
" return 0.0\n",
|
||||
" \n",
|
||||
" common = words1.intersection(words2)\n",
|
||||
" return len(common) / len(words1)\n",
|
||||
"\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "912a9c7c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Evaluate BaseLine Methods\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3d07562",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Pair Sentence 1 Sentence 2 \n",
|
||||
"====================================================================================================\n",
|
||||
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "tuple indices must be integers or slices, not dict",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
|
||||
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def evaluate_baseline_methods(pairs):\n",
|
||||
" \"\"\" Evaluate all baseline methods on test pairs\"\"\"\n",
|
||||
" methods = {\n",
|
||||
" 'Jaccard': jaccard_similarity,\n",
|
||||
" 'Levenshtein (char)': char_levenshtein_similarity,\n",
|
||||
" 'Levenshtein (word)': word_levenshtein_similarity,\n",
|
||||
" 'Cosine BOW': cosine_similarity_bow,\n",
|
||||
" 'Fuzzy Ratio': fuzzy_ratio_similarity,\n",
|
||||
" 'Fuzzy Partial': fuzzy_partial_ratio,\n",
|
||||
" 'Fuzzy Token Sort': fuzzy_token_sort_ratio,\n",
|
||||
" 'Fuzzy Token Set': fuzzy_token_set_ratio,\n",
|
||||
" 'LCS': longest_common_subsequence,\n",
|
||||
" 'Containment': containment_similarity,\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" results = {method: [] for method in methods}\n",
|
||||
" \n",
|
||||
" for sent1, sent2 in pairs:\n",
|
||||
" for method_name, method_func in methods.items():\n",
|
||||
" similarity = method_func(sent1, sent2)\n",
|
||||
" results[method_name].append(similarity)\n",
|
||||
" \n",
|
||||
" return results, methods\n",
|
||||
"\n",
|
||||
"def print_comparison_table(results, pairs):\n",
|
||||
" \"\"\" Print a formatted comparison table \"\"\"\n",
|
||||
" print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
|
||||
" print(\"=\" * 100)\n",
|
||||
" \n",
|
||||
" for i, (sent1, sent2) in enumerate(pairs):\n",
|
||||
" # Truncate long sentences for display\n",
|
||||
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
|
||||
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
|
||||
" \n",
|
||||
" print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n",
|
||||
" \n",
|
||||
" # Print similarities for this pair\n",
|
||||
" for method_name in results:\n",
|
||||
" similarity = results[method_name][i]\n",
|
||||
" print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n",
|
||||
" print(\"-\" * 100)\n",
|
||||
"\n",
|
||||
"results = evaluate_baseline_methods(test_pairs)\n",
|
||||
"print_comparison_table(results, test_pairs)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"display_name": "Python 3 (ipykernel)"
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -1,9 +1,12 @@
|
||||
datasets
|
||||
huggingface-hub
|
||||
pandas
|
||||
numpy
|
||||
scikit-learn
|
||||
spacy
|
||||
matplotlib
|
||||
seaborn
|
||||
jupyter
|
||||
|
||||
spacy
|
||||
tensor
|
||||
datasets
|
||||
jupyter
|
||||
|
||||
huggingface-hub
|
||||
seaborn
|
||||
2
spacy_models
Normal file
2
spacy_models
Normal file
@@ -0,0 +1,2 @@
|
||||
python -m spacy download en_core_web_lg
|
||||
python -m spacy download en_core_web_trf
|
||||
Reference in New Issue
Block a user