Testing parsing and Jaccard similarity in notebooks

This commit is contained in:
Henry Dowd
2025-11-18 23:25:04 +00:00
parent 42deee19f6
commit 8d6b1cab2c
3 changed files with 142 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
{
"cells": [
{
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-18T23:15:35.056834Z",
"start_time": "2025-11-18T23:15:35.051218Z"
}
},
"cell_type": "code",
"source": [
"import string\n",
"\n",
"\n",
"def _tokenize(sentence):\n",
"    \"\"\"Return the set of distinct lowercase words in `sentence`.\n",
"\n",
"    Punctuation attached to either end of a word is stripped so that\n",
"    'mat.' and 'mat' count as the same word; tokens that consist of\n",
"    nothing but punctuation are dropped.\n",
"    \"\"\"\n",
"    stripped = (word.strip(string.punctuation) for word in sentence.lower().split())\n",
"    return {word for word in stripped if word}\n",
"\n",
"\n",
"def jaccard_similarity(sent1, sent2):\n",
"    \"\"\"Return the Jaccard similarity of the word sets of two sentences.\n",
"\n",
"    The score is |A & B| / |A | B|, where A and B are the sets of\n",
"    case-insensitive, punctuation-free words of each sentence. It ranges\n",
"    from 0.0 (no shared words) to 1.0 (identical word sets); two\n",
"    sentences that contain no words at all score 0.0.\n",
"    \"\"\"\n",
"    words1 = _tokenize(sent1)\n",
"    words2 = _tokenize(sent2)\n",
"    union = words1 | words2\n",
"    if not union:  # neither sentence has any words: avoid dividing by zero\n",
"        return 0.0\n",
"    return len(words1 & words2) / len(union)\n",
"\n",
"\n",
"test_pairs = [\n",
"    (\"The cat sat on the mat.\", \"The cat sat on the mat.\"),  # Copy\n",
"    (\"The cat sat on the mat.\", \"On the mat sat the cat.\"),  # Same words rearranged\n",
"    (\"The cat sat on the mat.\", \"The dog ran in the park\"),  # Different\n",
"]\n",
"\n",
"for sent1, sent2 in test_pairs:\n",
"    similarity = jaccard_similarity(sent1, sent2)\n",
"    print(f\"'{sent1}' vs '{sent2}': {similarity:.3f}\")  # 3 decimal places\n"
],
"id": "e60d024e969254a",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
"'The cat sat on the mat.' vs 'On the mat sat the cat.': 1.000\n",
"'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
]
}
],
"execution_count": 7
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"language": "python",
"display_name": "Python 3 (ipykernel)"
}
},
"nbformat": 4,
"nbformat_minor": 5
}