Expanded on Baseline experiments, analyzed and compared
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "d2aa2997",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -56,7 +56,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 58,
|
||||
"id": "e60d024e969254a",
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
@@ -69,30 +69,20 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 0.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||
"====================================================================================================\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat 0.667\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
|
||||
"The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
|
||||
"The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
|
||||
"The cat sat on the mat. vs The dog ran in the park. 0.111\n",
|
||||
"I love programming. vs She enjoys reading books. 0.000\n",
|
||||
"The weather is nice today. vs It's raining outside. 0.000\n",
|
||||
"Short. vs Short. 1.000\n",
|
||||
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||
" vs 0.000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -111,10 +101,13 @@
|
||||
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||
"print(\"=\" * 100)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = jaccard_similarity(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)\n"
|
||||
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||
" #print(\"-\"* 50)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -128,7 +121,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 57,
|
||||
"id": "0b68fdcd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -136,42 +129,20 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
|
||||
" Char similarity: 0.957 --- Word similarity: 0.833\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
||||
" Char similarity: 0.821 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
|
||||
" Char similarity: 0.344 --- Word similarity: 0.143\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
|
||||
" Char similarity: 0.517 --- Word similarity: 0.500\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
|
||||
" Char similarity: 0.577 --- Word similarity: 0.400\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
|
||||
" Char similarity: 0.625 --- Word similarity: 0.333\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.':\n",
|
||||
" Char similarity: 0.200 --- Word similarity: 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.':\n",
|
||||
" Char similarity: 0.192 --- Word similarity: 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '':\n",
|
||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
|
||||
"====================================================================================================\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n",
|
||||
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n",
|
||||
"The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n",
|
||||
"The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n",
|
||||
"The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n",
|
||||
"I love programming. vs She enjoys reading books. 0.200 0.000 \n",
|
||||
"The weather is nice today. vs It's raining outside. 0.192 0.000 \n",
|
||||
"Short. vs Short. 1.000 1.000 \n",
|
||||
"A B C D E F G vs A B C D E F G 1.000 1.000 \n",
|
||||
" vs 1.000 1.000 \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -201,12 +172,16 @@
|
||||
" edit_distance = distance.Levenshtein.distance(words1, words2)\n",
|
||||
" return 1 - (edit_distance / max_len)\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
|
||||
"print(\"=\" * 100)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
|
||||
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}':\") \n",
|
||||
" print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n",
|
||||
" print(\"-\"* 50)"
|
||||
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
|
||||
" #print(\"-\"* 50"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -219,7 +194,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 38,
|
||||
"id": "46a985b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -227,30 +202,20 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 1.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||
"====================================================================================================\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
|
||||
"The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
|
||||
"The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
|
||||
"The cat sat on the mat. vs The dog ran in the park. 0.500\n",
|
||||
"I love programming. vs She enjoys reading books. 0.000\n",
|
||||
"The weather is nice today. vs It's raining outside. 0.000\n",
|
||||
"Short. vs Short. 1.000\n",
|
||||
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||
" vs 1.000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -285,10 +250,13 @@
|
||||
" \n",
|
||||
" return dot_product / (norm1 * norm2)\n",
|
||||
"\n",
|
||||
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||
"print(\"=\" * 100)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = cosine_similarity_bow(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)"
|
||||
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||
" #print(\"-\"* 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -301,7 +269,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": 5,
|
||||
"id": "7dc7ac2e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -379,7 +347,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"execution_count": 37,
|
||||
"id": "e6a4d4e2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -387,30 +355,20 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'Short.' vs 'Short.': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
||||
"--------------------------------------------------\n",
|
||||
"'' vs '': 0.000\n",
|
||||
"--------------------------------------------------\n"
|
||||
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||
"====================================================================================================\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
||||
"The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
|
||||
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
|
||||
"The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
|
||||
"The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
|
||||
"The cat sat on the mat. vs The dog ran in the park. 0.609\n",
|
||||
"I love programming. vs She enjoys reading books. 0.333\n",
|
||||
"The weather is nice today. vs It's raining outside. 0.360\n",
|
||||
"Short. vs Short. 1.000\n",
|
||||
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||
" vs 0.000\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -437,10 +395,13 @@
|
||||
" lcs_length = dp[m][n]\n",
|
||||
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
|
||||
"\n",
|
||||
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||
"print(\"=\" * 100)\n",
|
||||
"\n",
|
||||
"for sent1, sent2 in test_pairs:\n",
|
||||
" similarity = longest_common_subsequence(sent1, sent2)\n",
|
||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
||||
" print(\"-\"* 50)"
|
||||
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||
" #print(\"-\"* 50)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -471,8 +432,7 @@
|
||||
" \n",
|
||||
" common = words1.intersection(words2)\n",
|
||||
" return len(common) / len(words1)\n",
|
||||
"\n",
|
||||
" "
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -485,7 +445,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 25,
|
||||
"id": "b3d07562",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@@ -495,19 +455,150 @@
|
||||
"text": [
|
||||
"Pair Sentence 1 Sentence 2 \n",
|
||||
"====================================================================================================\n",
|
||||
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "TypeError",
|
||||
"evalue": "tuple indices must be integers or slices, not dict",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
|
||||
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
|
||||
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
|
||||
" Jaccard: 1.000\n",
|
||||
" Levenshtein (char): 1.000\n",
|
||||
" Levenshtein (word): 1.000\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 1.000\n",
|
||||
" Fuzzy Partial: 1.000\n",
|
||||
" Fuzzy Token Sort: 1.000\n",
|
||||
" Fuzzy Token Set: 1.000\n",
|
||||
" LCS: 1.000\n",
|
||||
" Containment: 1.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 2: The cat sat on the mat. The cat sat on the mat \n",
|
||||
" Jaccard: 0.667\n",
|
||||
" Levenshtein (char): 0.957\n",
|
||||
" Levenshtein (word): 0.833\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 0.978\n",
|
||||
" Fuzzy Partial: 1.000\n",
|
||||
" Fuzzy Token Sort: 0.978\n",
|
||||
" Fuzzy Token Set: 0.973\n",
|
||||
" LCS: 1.000\n",
|
||||
" Containment: 1.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
|
||||
" Jaccard: 1.000\n",
|
||||
" Levenshtein (char): 0.821\n",
|
||||
" Levenshtein (word): 1.000\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 0.902\n",
|
||||
" Fuzzy Partial: 0.826\n",
|
||||
" Fuzzy Token Sort: 1.000\n",
|
||||
" Fuzzy Token Set: 1.000\n",
|
||||
" LCS: 0.815\n",
|
||||
" Containment: 1.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
|
||||
" Jaccard: 0.375\n",
|
||||
" Levenshtein (char): 0.344\n",
|
||||
" Levenshtein (word): 0.143\n",
|
||||
" Cosine BOW: 0.825\n",
|
||||
" Fuzzy Ratio: 0.509\n",
|
||||
" Fuzzy Partial: 0.619\n",
|
||||
" Fuzzy Token Sort: 0.764\n",
|
||||
" Fuzzy Token Set: 0.723\n",
|
||||
" LCS: 0.433\n",
|
||||
" Containment: 0.800\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
|
||||
" Jaccard: 0.250\n",
|
||||
" Levenshtein (char): 0.517\n",
|
||||
" Levenshtein (word): 0.500\n",
|
||||
" Cosine BOW: 0.625\n",
|
||||
" Fuzzy Ratio: 0.615\n",
|
||||
" Fuzzy Partial: 0.605\n",
|
||||
" Fuzzy Token Sort: 0.538\n",
|
||||
" Fuzzy Token Set: 0.480\n",
|
||||
" LCS: 0.536\n",
|
||||
" Containment: 0.400\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
|
||||
" Jaccard: 0.250\n",
|
||||
" Levenshtein (char): 0.577\n",
|
||||
" Levenshtein (word): 0.400\n",
|
||||
" Cosine BOW: 0.400\n",
|
||||
" Fuzzy Ratio: 0.612\n",
|
||||
" Fuzzy Partial: 0.700\n",
|
||||
" Fuzzy Token Sort: 0.531\n",
|
||||
" Fuzzy Token Set: 0.562\n",
|
||||
" LCS: 0.560\n",
|
||||
" Containment: 0.400\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
|
||||
" Jaccard: 0.111\n",
|
||||
" Levenshtein (char): 0.625\n",
|
||||
" Levenshtein (word): 0.333\n",
|
||||
" Cosine BOW: 0.500\n",
|
||||
" Fuzzy Ratio: 0.638\n",
|
||||
" Fuzzy Partial: 0.636\n",
|
||||
" Fuzzy Token Sort: 0.553\n",
|
||||
" Fuzzy Token Set: 0.462\n",
|
||||
" LCS: 0.609\n",
|
||||
" Containment: 0.200\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 8: I love programming. She enjoys reading books. \n",
|
||||
" Jaccard: 0.000\n",
|
||||
" Levenshtein (char): 0.200\n",
|
||||
" Levenshtein (word): 0.000\n",
|
||||
" Cosine BOW: 0.000\n",
|
||||
" Fuzzy Ratio: 0.409\n",
|
||||
" Fuzzy Partial: 0.432\n",
|
||||
" Fuzzy Token Sort: 0.364\n",
|
||||
" Fuzzy Token Set: 0.364\n",
|
||||
" LCS: 0.333\n",
|
||||
" Containment: 0.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 9: The weather is nice today. It's raining outside. \n",
|
||||
" Jaccard: 0.000\n",
|
||||
" Levenshtein (char): 0.192\n",
|
||||
" Levenshtein (word): 0.000\n",
|
||||
" Cosine BOW: 0.000\n",
|
||||
" Fuzzy Ratio: 0.426\n",
|
||||
" Fuzzy Partial: 0.514\n",
|
||||
" Fuzzy Token Sort: 0.340\n",
|
||||
" Fuzzy Token Set: 0.340\n",
|
||||
" LCS: 0.360\n",
|
||||
" Containment: 0.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 10: Short. Short. \n",
|
||||
" Jaccard: 1.000\n",
|
||||
" Levenshtein (char): 1.000\n",
|
||||
" Levenshtein (word): 1.000\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 1.000\n",
|
||||
" Fuzzy Partial: 1.000\n",
|
||||
" Fuzzy Token Sort: 1.000\n",
|
||||
" Fuzzy Token Set: 1.000\n",
|
||||
" LCS: 1.000\n",
|
||||
" Containment: 1.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 11: A B C D E F G A B C D E F G \n",
|
||||
" Jaccard: 1.000\n",
|
||||
" Levenshtein (char): 1.000\n",
|
||||
" Levenshtein (word): 1.000\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 1.000\n",
|
||||
" Fuzzy Partial: 1.000\n",
|
||||
" Fuzzy Token Sort: 1.000\n",
|
||||
" Fuzzy Token Set: 1.000\n",
|
||||
" LCS: 1.000\n",
|
||||
" Containment: 1.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"Pair 12: \n",
|
||||
" Jaccard: 0.000\n",
|
||||
" Levenshtein (char): 1.000\n",
|
||||
" Levenshtein (word): 1.000\n",
|
||||
" Cosine BOW: 1.000\n",
|
||||
" Fuzzy Ratio: 1.000\n",
|
||||
" Fuzzy Partial: 1.000\n",
|
||||
" Fuzzy Token Sort: 1.000\n",
|
||||
" Fuzzy Token Set: 0.000\n",
|
||||
" LCS: 0.000\n",
|
||||
" Containment: 0.000\n",
|
||||
"----------------------------------------------------------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -538,23 +629,23 @@
|
||||
"\n",
|
||||
"def print_comparison_table(results, pairs):\n",
|
||||
" \"\"\" Print a formatted comparison table \"\"\"\n",
|
||||
" print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
|
||||
" print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
|
||||
" print(\"=\" * 100)\n",
|
||||
" \n",
|
||||
" for i, (sent1, sent2) in enumerate(pairs):\n",
|
||||
" # Truncate long sentences for display\n",
|
||||
" # if sentence too long\n",
|
||||
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
|
||||
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
|
||||
" \n",
|
||||
" print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n",
|
||||
" print(f\"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}\")\n",
|
||||
" \n",
|
||||
" # Print similarities for this pair\n",
|
||||
" for method_name in results:\n",
|
||||
" similarity = results[method_name][i]\n",
|
||||
" print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n",
|
||||
" print(f\"{'':<30} {method_name + ':':<20} {similarity:.3f}\")\n",
|
||||
" print(\"-\" * 100)\n",
|
||||
"\n",
|
||||
"results = evaluate_baseline_methods(test_pairs)\n",
|
||||
"results, methods = evaluate_baseline_methods(test_pairs)\n",
|
||||
"print_comparison_table(results, test_pairs)"
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user