Testing parsing and jaccard similarity in notebooks
This commit is contained in:
@@ -0,0 +1,73 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"source": "Keep punctuation for direct-copy detection, but remove it for semantic/keyword-based methods.",
|
||||||
|
"id": "1c26616777253f10"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-11-18T23:01:17.888318Z",
|
||||||
|
"start_time": "2025-11-18T23:01:16.494987Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"\n",
|
||||||
# Shared spaCy pipeline used for tokenisation below.
# Can swap for large model if required.
nlp = spacy.load("en_core_web_md")

# Sample inputs: two sentences with overlapping wording and one unrelated.
test_sentences = [
    "The cat sat on the mat.",
    "On the mat, the cat was sitting.",
    "A completely different sentence about something else.",
]

# Show each sentence next to the token texts the pipeline produces for it,
# followed by a separator line.
for sent in test_sentences:
    doc = nlp(sent)
    print(
        f"Sentence: {sent}",
        f"Tokens: {[token.text for token in doc]}",
        "---",
        sep="\n",
    )
|
||||||
|
"\n"
|
||||||
|
],
|
||||||
|
"id": "e003ac06a58cfbb4",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Sentence: The cat sat on the mat.\n",
|
||||||
|
"Tokens: ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']\n",
|
||||||
|
"---\n",
|
||||||
|
"Sentence: On the mat, the cat was sitting.\n",
|
||||||
|
"Tokens: ['On', 'the', 'mat', ',', 'the', 'cat', 'was', 'sitting', '.']\n",
|
||||||
|
"---\n",
|
||||||
|
"Sentence: A completely different sentence about something else.\n",
|
||||||
|
"Tokens: ['A', 'completely', 'different', 'sentence', 'about', 'something', 'else', '.']\n",
|
||||||
|
"---\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": null,
|
||||||
|
"source": "",
|
||||||
|
"id": "83fc18c9de2e354"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"language": "python",
|
||||||
|
"display_name": "Python 3 (ipykernel)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2025-11-18T23:15:35.056834Z",
|
||||||
|
"start_time": "2025-11-18T23:15:35.051218Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
import string


def jaccard_similarity(sent1, sent2, keep_punctuation=False):
    """Return the Jaccard similarity of the word sets of two sentences.

    Each sentence is lowercased and split on whitespace; the score is
    ``len(A & B) / len(A | B)`` over the two resulting word sets.

    Args:
        sent1: First sentence (``str``).
        sent2: Second sentence (``str``).
        keep_punctuation: When ``False`` (default), leading and trailing
            ASCII punctuation is stripped from every token and tokens that
            become empty are dropped, so ``"mat."`` and ``"mat"`` count as
            the same word. When ``True``, raw whitespace-separated tokens
            are compared as-is.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when neither sentence yields any
        words.
    """

    def _word_set(sentence):
        # Lowercase first so "The" and "the" collapse into one word.
        tokens = sentence.lower().split()
        if keep_punctuation:
            return set(tokens)
        # Only the edges are stripped, so internal characters such as the
        # apostrophe in "don't" survive; pure-punctuation tokens vanish.
        stripped = (token.strip(string.punctuation) for token in tokens)
        return {token for token in stripped if token}

    words1 = _word_set(sent1)
    words2 = _word_set(sent2)

    union = words1 | words2
    if not union:
        # Avoid dividing by zero when both inputs are empty.
        return 0.0
    return len(words1 & words2) / len(union)
|
||||||
|
"\n",
|
||||||
# Sentence pairs used to exercise jaccard_similarity.
test_pairs = [
    # Copy
    ("The cat sat on the mat.", "The cat sat on the mat."),
    # Same words rearranged
    ("The cat sat on the mat.", "On the mat sat the cat."),
    # Different
    ("The cat sat on the mat.", "The dog ran in the park"),
]

# Print one line per pair, with the score shown to 3 decimal places.
for sent1, sent2 in test_pairs:
    similarity = jaccard_similarity(sent1, sent2)
    print(f"'{sent1}' vs '{sent2}': {similarity:.3f}")
|
||||||
|
],
|
||||||
|
"id": "e60d024e969254a",
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n",
|
||||||
|
"'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n",
|
||||||
|
"'The cat sat on the mat.' vs 'The dog ran in the park': 0.111\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"execution_count": 7
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"language": "python",
|
||||||
|
"display_name": "Python 3 (ipykernel)"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|||||||
@@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"metadata": {},
|
||||||
|
"cell_type": "code",
|
||||||
|
"outputs": [],
|
||||||
|
"execution_count": null,
|
||||||
|
"source": "",
|
||||||
|
"id": "8a3c4314a90086fe"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user