Expanded on Baseline experiments, analyzed and compared
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 1,
|
||||||
"id": "d2aa2997",
|
"id": "d2aa2997",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -56,7 +56,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 58,
|
||||||
"id": "e60d024e969254a",
|
"id": "e60d024e969254a",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"ExecuteTime": {
|
"ExecuteTime": {
|
||||||
@@ -69,30 +69,20 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||||
"--------------------------------------------------\n",
|
"====================================================================================================\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 0.667\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs The cat sat on the mat 0.667\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.375\n",
|
||||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.375\n",
|
"The cat sat on the mat. vs The feline rested on the rug. 0.250\n",
|
||||||
"--------------------------------------------------\n",
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.250\n",
|
||||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.250\n",
|
"The cat sat on the mat. vs The dog ran in the park. 0.111\n",
|
||||||
"--------------------------------------------------\n",
|
"I love programming. vs She enjoys reading books. 0.000\n",
|
||||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.250\n",
|
"The weather is nice today. vs It's raining outside. 0.000\n",
|
||||||
"--------------------------------------------------\n",
|
"Short. vs Short. 1.000\n",
|
||||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.111\n",
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
" vs 0.000\n"
|
||||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'Short.' vs 'Short.': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'' vs '': 0.000\n",
|
|
||||||
"--------------------------------------------------\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -111,10 +101,13 @@
|
|||||||
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
|
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||||
|
"print(\"=\" * 100)\n",
|
||||||
|
"\n",
|
||||||
"for sent1, sent2 in test_pairs:\n",
|
"for sent1, sent2 in test_pairs:\n",
|
||||||
" similarity = jaccard_similarity(sent1, sent2)\n",
|
" similarity = jaccard_similarity(sent1, sent2)\n",
|
||||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||||
" print(\"-\"* 50)\n"
|
" #print(\"-\"* 50)\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -128,7 +121,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 57,
|
||||||
"id": "0b68fdcd",
|
"id": "0b68fdcd",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -136,42 +129,20 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
"Sentence 1 Sentence 2: Similarities -> Char: Word:\n",
|
||||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
"====================================================================================================\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000 1.000 \n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat':\n",
|
"The cat sat on the mat. vs The cat sat on the mat 0.957 0.833 \n",
|
||||||
" Char similarity: 0.957 --- Word similarity: 0.833\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 0.821 1.000 \n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.344 0.143 \n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.':\n",
|
"The cat sat on the mat. vs The feline rested on the rug. 0.517 0.500 \n",
|
||||||
" Char similarity: 0.821 --- Word similarity: 1.000\n",
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.577 0.400 \n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs The dog ran in the park. 0.625 0.333 \n",
|
||||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.':\n",
|
"I love programming. vs She enjoys reading books. 0.200 0.000 \n",
|
||||||
" Char similarity: 0.344 --- Word similarity: 0.143\n",
|
"The weather is nice today. vs It's raining outside. 0.192 0.000 \n",
|
||||||
"--------------------------------------------------\n",
|
"Short. vs Short. 1.000 1.000 \n",
|
||||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.':\n",
|
"A B C D E F G vs A B C D E F G 1.000 1.000 \n",
|
||||||
" Char similarity: 0.517 --- Word similarity: 0.500\n",
|
" vs 1.000 1.000 \n"
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.':\n",
|
|
||||||
" Char similarity: 0.577 --- Word similarity: 0.400\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The cat sat on the mat.' vs 'The dog ran in the park.':\n",
|
|
||||||
" Char similarity: 0.625 --- Word similarity: 0.333\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'I love programming.' vs 'She enjoys reading books.':\n",
|
|
||||||
" Char similarity: 0.200 --- Word similarity: 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The weather is nice today.' vs 'It's raining outside.':\n",
|
|
||||||
" Char similarity: 0.192 --- Word similarity: 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'Short.' vs 'Short.':\n",
|
|
||||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'A B C D E F G' vs 'A B C D E F G':\n",
|
|
||||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'' vs '':\n",
|
|
||||||
" Char similarity: 1.000 --- Word similarity: 1.000\n",
|
|
||||||
"--------------------------------------------------\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -201,12 +172,16 @@
|
|||||||
" edit_distance = distance.Levenshtein.distance(words1, words2)\n",
|
" edit_distance = distance.Levenshtein.distance(words1, words2)\n",
|
||||||
" return 1 - (edit_distance / max_len)\n",
|
" return 1 - (edit_distance / max_len)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
" \n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<20} {'Similarities ->':<19} {'Char:':<10} {'Word:'}\")\n",
|
||||||
|
"print(\"=\" * 100)\n",
|
||||||
|
"\n",
|
||||||
"for sent1, sent2 in test_pairs:\n",
|
"for sent1, sent2 in test_pairs:\n",
|
||||||
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
|
" char_similarity = char_levenshtein_similarity(sent1, sent2)\n",
|
||||||
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
|
" word_similarity = word_levenshtein_similarity(sent1, sent2)\n",
|
||||||
" print(f\"'{sent1}' vs '{sent2}':\") \n",
|
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {char_similarity:.3f}{'':<5} {word_similarity:.3F} \") # 3 decimal places\n",
|
||||||
" print(f\" Char similarity: {char_similarity:.3f} --- Word similarity: {word_similarity:.3f}\") # 3 decimal place\n",
|
" #print(\"-\"* 50"
|
||||||
" print(\"-\"* 50)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -219,7 +194,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 38,
|
||||||
"id": "46a985b4",
|
"id": "46a985b4",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -227,30 +202,20 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||||
"--------------------------------------------------\n",
|
"====================================================================================================\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.825\n",
|
||||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.825\n",
|
"The cat sat on the mat. vs The feline rested on the rug. 0.625\n",
|
||||||
"--------------------------------------------------\n",
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.400\n",
|
||||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.625\n",
|
"The cat sat on the mat. vs The dog ran in the park. 0.500\n",
|
||||||
"--------------------------------------------------\n",
|
"I love programming. vs She enjoys reading books. 0.000\n",
|
||||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.400\n",
|
"The weather is nice today. vs It's raining outside. 0.000\n",
|
||||||
"--------------------------------------------------\n",
|
"Short. vs Short. 1.000\n",
|
||||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.500\n",
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
" vs 1.000\n"
|
||||||
"'I love programming.' vs 'She enjoys reading books.': 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The weather is nice today.' vs 'It's raining outside.': 0.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'Short.' vs 'Short.': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'' vs '': 1.000\n",
|
|
||||||
"--------------------------------------------------\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -285,10 +250,13 @@
|
|||||||
" \n",
|
" \n",
|
||||||
" return dot_product / (norm1 * norm2)\n",
|
" return dot_product / (norm1 * norm2)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||||
|
"print(\"=\" * 100)\n",
|
||||||
|
"\n",
|
||||||
"for sent1, sent2 in test_pairs:\n",
|
"for sent1, sent2 in test_pairs:\n",
|
||||||
" similarity = cosine_similarity_bow(sent1, sent2)\n",
|
" similarity = cosine_similarity_bow(sent1, sent2)\n",
|
||||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||||
" print(\"-\"* 50)"
|
" #print(\"-\"* 50)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -301,7 +269,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 23,
|
"execution_count": 5,
|
||||||
"id": "7dc7ac2e",
|
"id": "7dc7ac2e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -379,7 +347,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 25,
|
"execution_count": 37,
|
||||||
"id": "e6a4d4e2",
|
"id": "e6a4d4e2",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -387,30 +355,20 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
"Sentence 1 Sentence 2: Similarity Score:\n",
|
||||||
"--------------------------------------------------\n",
|
"====================================================================================================\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat': 1.000\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs The cat sat on the mat 1.000\n",
|
||||||
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 0.815\n",
|
"The cat sat on the mat. vs The cat sat on the mat. 0.815\n",
|
||||||
"--------------------------------------------------\n",
|
"The cat sat on the mat. vs On the mat, the cat was sitting. 0.433\n",
|
||||||
"'The cat sat on the mat.' vs 'On the mat, the cat was sitting.': 0.433\n",
|
"The cat sat on the mat. vs The feline rested on the rug. 0.536\n",
|
||||||
"--------------------------------------------------\n",
|
"The quick brown fox jumps. vs A fast brown fox leaps. 0.560\n",
|
||||||
"'The cat sat on the mat.' vs 'The feline rested on the rug.': 0.536\n",
|
"The cat sat on the mat. vs The dog ran in the park. 0.609\n",
|
||||||
"--------------------------------------------------\n",
|
"I love programming. vs She enjoys reading books. 0.333\n",
|
||||||
"'The quick brown fox jumps.' vs 'A fast brown fox leaps.': 0.560\n",
|
"The weather is nice today. vs It's raining outside. 0.360\n",
|
||||||
"--------------------------------------------------\n",
|
"Short. vs Short. 1.000\n",
|
||||||
"'The cat sat on the mat.' vs 'The dog ran in the park.': 0.609\n",
|
"A B C D E F G vs A B C D E F G 1.000\n",
|
||||||
"--------------------------------------------------\n",
|
" vs 0.000\n"
|
||||||
"'I love programming.' vs 'She enjoys reading books.': 0.333\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'The weather is nice today.' vs 'It's raining outside.': 0.360\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'Short.' vs 'Short.': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'A B C D E F G' vs 'A B C D E F G': 1.000\n",
|
|
||||||
"--------------------------------------------------\n",
|
|
||||||
"'' vs '': 0.000\n",
|
|
||||||
"--------------------------------------------------\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -437,10 +395,13 @@
|
|||||||
" lcs_length = dp[m][n]\n",
|
" lcs_length = dp[m][n]\n",
|
||||||
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
|
" return lcs_length / max(m, n) if max(m, n) > 0 else 0.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
"print(f\"{'Sentence 1':<41} {'Sentence 2:':<40} {'Similarity Score:'}\")\n",
|
||||||
|
"print(\"=\" * 100)\n",
|
||||||
|
"\n",
|
||||||
"for sent1, sent2 in test_pairs:\n",
|
"for sent1, sent2 in test_pairs:\n",
|
||||||
" similarity = longest_common_subsequence(sent1, sent2)\n",
|
" similarity = longest_common_subsequence(sent1, sent2)\n",
|
||||||
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n",
|
" print(f\"{sent1:<30} {'vs':<10} {sent2:<40} {similarity:.3f}\") # 3 decimal places\n",
|
||||||
" print(\"-\"* 50)"
|
" #print(\"-\"* 50)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -471,8 +432,7 @@
|
|||||||
" \n",
|
" \n",
|
||||||
" common = words1.intersection(words2)\n",
|
" common = words1.intersection(words2)\n",
|
||||||
" return len(common) / len(words1)\n",
|
" return len(common) / len(words1)\n",
|
||||||
"\n",
|
"\n"
|
||||||
" "
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -485,7 +445,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 25,
|
||||||
"id": "b3d07562",
|
"id": "b3d07562",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@@ -495,19 +455,150 @@
|
|||||||
"text": [
|
"text": [
|
||||||
"Pair Sentence 1 Sentence 2 \n",
|
"Pair Sentence 1 Sentence 2 \n",
|
||||||
"====================================================================================================\n",
|
"====================================================================================================\n",
|
||||||
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n"
|
"Pair 1: The cat sat on the mat. The cat sat on the mat. \n",
|
||||||
]
|
" Jaccard: 1.000\n",
|
||||||
},
|
" Levenshtein (char): 1.000\n",
|
||||||
{
|
" Levenshtein (word): 1.000\n",
|
||||||
"ename": "TypeError",
|
" Cosine BOW: 1.000\n",
|
||||||
"evalue": "tuple indices must be integers or slices, not dict",
|
" Fuzzy Ratio: 1.000\n",
|
||||||
"output_type": "error",
|
" Fuzzy Partial: 1.000\n",
|
||||||
"traceback": [
|
" Fuzzy Token Sort: 1.000\n",
|
||||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
" Fuzzy Token Set: 1.000\n",
|
||||||
"\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
|
" LCS: 1.000\n",
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 44\u001b[39m\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n\u001b[32m 43\u001b[39m results = evaluate_baseline_methods(test_pairs)\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m \u001b[43mprint_comparison_table\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresults\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtest_pairs\u001b[49m\u001b[43m)\u001b[49m\n",
|
" Containment: 1.000\n",
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 39\u001b[39m, in \u001b[36mprint_comparison_table\u001b[39m\u001b[34m(results, pairs)\u001b[39m\n\u001b[32m 37\u001b[39m \u001b[38;5;66;03m# Print similarities for this pair\u001b[39;00m\n\u001b[32m 38\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m method_name \u001b[38;5;129;01min\u001b[39;00m results:\n\u001b[32m---> \u001b[39m\u001b[32m39\u001b[39m similarity = \u001b[43mresults\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmethod_name\u001b[49m\u001b[43m]\u001b[49m[i]\n\u001b[32m 40\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[33m'\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<40\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod_name\u001b[38;5;250m \u001b[39m+\u001b[38;5;250m \u001b[39m\u001b[33m'\u001b[39m\u001b[33m:\u001b[39m\u001b[33m'\u001b[39m\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m<20\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 41\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m-\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m100\u001b[39m)\n",
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
"\u001b[31mTypeError\u001b[39m: tuple indices must be integers or slices, not dict"
|
"Pair 2: The cat sat on the mat. The cat sat on the mat \n",
|
||||||
|
" Jaccard: 0.667\n",
|
||||||
|
" Levenshtein (char): 0.957\n",
|
||||||
|
" Levenshtein (word): 0.833\n",
|
||||||
|
" Cosine BOW: 1.000\n",
|
||||||
|
" Fuzzy Ratio: 0.978\n",
|
||||||
|
" Fuzzy Partial: 1.000\n",
|
||||||
|
" Fuzzy Token Sort: 0.978\n",
|
||||||
|
" Fuzzy Token Set: 0.973\n",
|
||||||
|
" LCS: 1.000\n",
|
||||||
|
" Containment: 1.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 3: The cat sat on the mat. The cat sat on the mat. \n",
|
||||||
|
" Jaccard: 1.000\n",
|
||||||
|
" Levenshtein (char): 0.821\n",
|
||||||
|
" Levenshtein (word): 1.000\n",
|
||||||
|
" Cosine BOW: 1.000\n",
|
||||||
|
" Fuzzy Ratio: 0.902\n",
|
||||||
|
" Fuzzy Partial: 0.826\n",
|
||||||
|
" Fuzzy Token Sort: 1.000\n",
|
||||||
|
" Fuzzy Token Set: 1.000\n",
|
||||||
|
" LCS: 0.815\n",
|
||||||
|
" Containment: 1.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 4: The cat sat on the mat. On the mat, the cat was sit...\n",
|
||||||
|
" Jaccard: 0.375\n",
|
||||||
|
" Levenshtein (char): 0.344\n",
|
||||||
|
" Levenshtein (word): 0.143\n",
|
||||||
|
" Cosine BOW: 0.825\n",
|
||||||
|
" Fuzzy Ratio: 0.509\n",
|
||||||
|
" Fuzzy Partial: 0.619\n",
|
||||||
|
" Fuzzy Token Sort: 0.764\n",
|
||||||
|
" Fuzzy Token Set: 0.723\n",
|
||||||
|
" LCS: 0.433\n",
|
||||||
|
" Containment: 0.800\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 5: The cat sat on the mat. The feline rested on the rug. \n",
|
||||||
|
" Jaccard: 0.250\n",
|
||||||
|
" Levenshtein (char): 0.517\n",
|
||||||
|
" Levenshtein (word): 0.500\n",
|
||||||
|
" Cosine BOW: 0.625\n",
|
||||||
|
" Fuzzy Ratio: 0.615\n",
|
||||||
|
" Fuzzy Partial: 0.605\n",
|
||||||
|
" Fuzzy Token Sort: 0.538\n",
|
||||||
|
" Fuzzy Token Set: 0.480\n",
|
||||||
|
" LCS: 0.536\n",
|
||||||
|
" Containment: 0.400\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 6: The quick brown fox jumps. A fast brown fox leaps. \n",
|
||||||
|
" Jaccard: 0.250\n",
|
||||||
|
" Levenshtein (char): 0.577\n",
|
||||||
|
" Levenshtein (word): 0.400\n",
|
||||||
|
" Cosine BOW: 0.400\n",
|
||||||
|
" Fuzzy Ratio: 0.612\n",
|
||||||
|
" Fuzzy Partial: 0.700\n",
|
||||||
|
" Fuzzy Token Sort: 0.531\n",
|
||||||
|
" Fuzzy Token Set: 0.562\n",
|
||||||
|
" LCS: 0.560\n",
|
||||||
|
" Containment: 0.400\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 7: The cat sat on the mat. The dog ran in the park. \n",
|
||||||
|
" Jaccard: 0.111\n",
|
||||||
|
" Levenshtein (char): 0.625\n",
|
||||||
|
" Levenshtein (word): 0.333\n",
|
||||||
|
" Cosine BOW: 0.500\n",
|
||||||
|
" Fuzzy Ratio: 0.638\n",
|
||||||
|
" Fuzzy Partial: 0.636\n",
|
||||||
|
" Fuzzy Token Sort: 0.553\n",
|
||||||
|
" Fuzzy Token Set: 0.462\n",
|
||||||
|
" LCS: 0.609\n",
|
||||||
|
" Containment: 0.200\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 8: I love programming. She enjoys reading books. \n",
|
||||||
|
" Jaccard: 0.000\n",
|
||||||
|
" Levenshtein (char): 0.200\n",
|
||||||
|
" Levenshtein (word): 0.000\n",
|
||||||
|
" Cosine BOW: 0.000\n",
|
||||||
|
" Fuzzy Ratio: 0.409\n",
|
||||||
|
" Fuzzy Partial: 0.432\n",
|
||||||
|
" Fuzzy Token Sort: 0.364\n",
|
||||||
|
" Fuzzy Token Set: 0.364\n",
|
||||||
|
" LCS: 0.333\n",
|
||||||
|
" Containment: 0.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 9: The weather is nice today. It's raining outside. \n",
|
||||||
|
" Jaccard: 0.000\n",
|
||||||
|
" Levenshtein (char): 0.192\n",
|
||||||
|
" Levenshtein (word): 0.000\n",
|
||||||
|
" Cosine BOW: 0.000\n",
|
||||||
|
" Fuzzy Ratio: 0.426\n",
|
||||||
|
" Fuzzy Partial: 0.514\n",
|
||||||
|
" Fuzzy Token Sort: 0.340\n",
|
||||||
|
" Fuzzy Token Set: 0.340\n",
|
||||||
|
" LCS: 0.360\n",
|
||||||
|
" Containment: 0.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 10: Short. Short. \n",
|
||||||
|
" Jaccard: 1.000\n",
|
||||||
|
" Levenshtein (char): 1.000\n",
|
||||||
|
" Levenshtein (word): 1.000\n",
|
||||||
|
" Cosine BOW: 1.000\n",
|
||||||
|
" Fuzzy Ratio: 1.000\n",
|
||||||
|
" Fuzzy Partial: 1.000\n",
|
||||||
|
" Fuzzy Token Sort: 1.000\n",
|
||||||
|
" Fuzzy Token Set: 1.000\n",
|
||||||
|
" LCS: 1.000\n",
|
||||||
|
" Containment: 1.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 11: A B C D E F G A B C D E F G \n",
|
||||||
|
" Jaccard: 1.000\n",
|
||||||
|
" Levenshtein (char): 1.000\n",
|
||||||
|
" Levenshtein (word): 1.000\n",
|
||||||
|
" Cosine BOW: 1.000\n",
|
||||||
|
" Fuzzy Ratio: 1.000\n",
|
||||||
|
" Fuzzy Partial: 1.000\n",
|
||||||
|
" Fuzzy Token Sort: 1.000\n",
|
||||||
|
" Fuzzy Token Set: 1.000\n",
|
||||||
|
" LCS: 1.000\n",
|
||||||
|
" Containment: 1.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n",
|
||||||
|
"Pair 12: \n",
|
||||||
|
" Jaccard: 0.000\n",
|
||||||
|
" Levenshtein (char): 1.000\n",
|
||||||
|
" Levenshtein (word): 1.000\n",
|
||||||
|
" Cosine BOW: 1.000\n",
|
||||||
|
" Fuzzy Ratio: 1.000\n",
|
||||||
|
" Fuzzy Partial: 1.000\n",
|
||||||
|
" Fuzzy Token Sort: 1.000\n",
|
||||||
|
" Fuzzy Token Set: 0.000\n",
|
||||||
|
" LCS: 0.000\n",
|
||||||
|
" Containment: 0.000\n",
|
||||||
|
"----------------------------------------------------------------------------------------------------\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@@ -538,23 +629,23 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"def print_comparison_table(results, pairs):\n",
|
"def print_comparison_table(results, pairs):\n",
|
||||||
" \"\"\" Print a formatted comparison table \"\"\"\n",
|
" \"\"\" Print a formatted comparison table \"\"\"\n",
|
||||||
" print(f\"{'Pair':<40} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
|
" print(f\"{'Pair':<30} {'Sentence 1':<30} {'Sentence 2':<30}\")\n",
|
||||||
" print(\"=\" * 100)\n",
|
" print(\"=\" * 100)\n",
|
||||||
" \n",
|
" \n",
|
||||||
" for i, (sent1, sent2) in enumerate(pairs):\n",
|
" for i, (sent1, sent2) in enumerate(pairs):\n",
|
||||||
" # Truncate long sentences for display\n",
|
" # if sentence too long\n",
|
||||||
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
|
" display_sent1 = sent1[:27] + \"...\" if len(sent1) > 30 else sent1\n",
|
||||||
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
|
" display_sent2 = sent2[:27] + \"...\" if len(sent2) > 30 else sent2\n",
|
||||||
" \n",
|
" \n",
|
||||||
" print(f\"{f'Pair {i+1}:':<40} {display_sent1:<30} {display_sent2:<30}\")\n",
|
" print(f\"{f'Pair {i+1}:':<30} {display_sent1:<30} {display_sent2:<30}\")\n",
|
||||||
" \n",
|
" \n",
|
||||||
" # Print similarities for this pair\n",
|
" # Print similarities for this pair\n",
|
||||||
" for method_name in results:\n",
|
" for method_name in results:\n",
|
||||||
" similarity = results[method_name][i]\n",
|
" similarity = results[method_name][i]\n",
|
||||||
" print(f\"{'':<40} {method_name + ':':<20} {similarity:.3f}\")\n",
|
" print(f\"{'':<30} {method_name + ':':<20} {similarity:.3f}\")\n",
|
||||||
" print(\"-\" * 100)\n",
|
" print(\"-\" * 100)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"results = evaluate_baseline_methods(test_pairs)\n",
|
"results, methods = evaluate_baseline_methods(test_pairs)\n",
|
||||||
"print_comparison_table(results, test_pairs)"
|
"print_comparison_table(results, test_pairs)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user