57 lines
1.9 KiB
Plaintext
57 lines
1.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"metadata": {
|
|
"ExecuteTime": {
|
|
"end_time": "2025-11-19T00:00:04.962487Z",
|
|
"start_time": "2025-11-19T00:00:04.958995Z"
|
|
}
|
|
},
|
|
"cell_type": "code",
|
|
"source": [
|
|
"import import_ipynb\n",
|
|
"from notebooks.01_data_exploration import *\n",
|
|
"\n",
|
|
"def jaccard_similarity(sent1, sent2):\n",
|
|
" # make lowercase and split into words\n",
|
|
" words1 = set(sent1.lower().split())\n",
|
|
" words2 = set(sent2.lower().split())\n",
|
|
" intersection = words1.intersection(words2)\n",
|
|
" union = words1.union(words2)\n",
|
|
" return float(len(intersection)) / len(union) if union else 0.0\n",
|
|
"\n",
|
|
"test_pairs = [\n",
|
|
" (\"The cat sat on the mat.\", \"The cat sat on the mat.\"), # Copy\n",
|
|
" (\"The cat sat on the mat.\", \"On the mat sat the cat.\"), # Same words rearranged\n",
|
|
" (\"The cat sat on the mat.\", \"The dog ran in the park\") # Different\n",
|
|
"]\n",
|
|
"\n",
|
|
"for sent1, sent2 in test_pairs:\n",
|
|
" similarity = jaccard_similarity(sent1, sent2)\n",
|
|
" print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\") # 3 decimal places\n"
|
|
],
|
|
"id": "e60d024e969254a",
|
|
"outputs": [
|
|
{
|
|
"ename": "SyntaxError",
|
|
"evalue": "invalid decimal literal (2501033926.py, line 2)",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
" \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n"
|
|
]
|
|
}
|
|
],
|
|
"execution_count": 8
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"language": "python",
|
|
"display_name": "Python 3 (ipykernel)"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|