research text

This commit is contained in:
Henry Dowd
2025-11-29 14:54:32 +00:00
parent fb68bc869a
commit 02cdc7bac6
7 changed files with 447 additions and 210 deletions

View File

@@ -10,6 +10,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"id": "12579bf734bb1a92",
"metadata": {
"ExecuteTime": {
@@ -17,11 +18,13 @@
"start_time": "2025-11-23T13:53:56.325948Z"
}
},
"outputs": [],
"source": [
"import token\n",
"import spacy\n",
"from spacy import displacy\n",
"from IPython.display import display, HTML\n",
"import torch\n",
"\n",
"nlp = spacy.load(\"en_core_web_md\") # Medium size model\n",
"\n",
@@ -30,9 +33,7 @@
" \"On the mat, the cat was sitting.\",\n",
" \"A completely different sentence about something else.\"\n",
"]"
],
"outputs": [],
"execution_count": 1
]
},
{
"cell_type": "markdown",
@@ -44,6 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e003ac06a58cfbb4",
"metadata": {
"ExecuteTime": {
@@ -51,14 +53,6 @@
"start_time": "2025-11-23T13:54:12.896440Z"
}
},
"source": [
"\n",
"for sent in test_sentences:\n",
" doc = nlp(sent)\n",
" print(f\"Sentence: {sent}\")\n",
" print(f\"Tokens: {[token.text for token in doc]}\")\n",
" print(\"---\")\n"
],
"outputs": [
{
"name": "stdout",
@@ -76,10 +70,18 @@
]
}
],
"execution_count": 2
"source": [
"\n",
"for sent in test_sentences:\n",
" doc = nlp(sent)\n",
" print(f\"Sentence: {sent}\")\n",
" print(f\"Tokens: {[token.text for token in doc]}\")\n",
" print(\"---\")\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5e488a878a5cfccb",
"metadata": {
"ExecuteTime": {
@@ -87,6 +89,39 @@
"start_time": "2025-11-23T13:55:22.744266Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------\n",
"Sentence: The cat sat on the mat.\n",
"--- Direct Sentence ---\n",
"the cat sat on the mat.\n",
"--- Semantic Sentence ---\n",
"cat sit mat\n",
"--- Syntactic Sentence ---\n",
"the cat sit on the mat .\n",
"--------------------------------------------------\n",
"Sentence: On the mat, the cat was sitting.\n",
"--- Direct Sentence ---\n",
"on the mat, the cat was sitting.\n",
"--- Semantic Sentence ---\n",
"mat cat sit\n",
"--- Syntactic Sentence ---\n",
"on the mat , the cat be sit .\n",
"--------------------------------------------------\n",
"Sentence: A completely different sentence about something else.\n",
"--- Direct Sentence ---\n",
"a completely different sentence about something else.\n",
"--- Semantic Sentence ---\n",
"completely different sentence\n",
"--- Syntactic Sentence ---\n",
"a completely different sentence about something else .\n",
"--------------------------------------------------\n"
]
}
],
"source": [
"\n",
"class TextPreprocessor:\n",
@@ -157,44 +192,11 @@
"# print(\"--- Syntactic Analysis ---\")\n",
"# print(f\"Preprocessed Sentence: {preprocessor.syntactic_analysis(sent)}\")\n",
"# print(\"-\" * 50)"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--------------------------------------------------\n",
"Sentence: The cat sat on the mat.\n",
"--- Direct Sentence ---\n",
"the cat sat on the mat.\n",
"--- Semantic Sentence ---\n",
"cat sit mat\n",
"--- Syntactic Sentence ---\n",
"the cat sit on the mat .\n",
"--------------------------------------------------\n",
"Sentence: On the mat, the cat was sitting.\n",
"--- Direct Sentence ---\n",
"on the mat, the cat was sitting.\n",
"--- Semantic Sentence ---\n",
"mat cat sit\n",
"--- Syntactic Sentence ---\n",
"on the mat , the cat be sit .\n",
"--------------------------------------------------\n",
"Sentence: A completely different sentence about something else.\n",
"--- Direct Sentence ---\n",
"a completely different sentence about something else.\n",
"--- Semantic Sentence ---\n",
"completely different sentence\n",
"--- Syntactic Sentence ---\n",
"a completely different sentence about something else .\n",
"--------------------------------------------------\n"
]
}
],
"execution_count": 3
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "83fc18c9de2e354",
"metadata": {
"ExecuteTime": {
@@ -202,24 +204,6 @@
"start_time": "2025-11-23T13:55:33.565711Z"
}
},
"source": [
"\n",
"def extract_parse_tree(text):\n",
" doc = nlp(text)\n",
"\n",
" print(f\"Sentence: {text}\")\n",
" print(\"\\nDependenct Parse Tree:\")\n",
" print(\"-\" * 50)\n",
"\n",
" for token in doc:\n",
" print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n",
"\n",
" return doc\n",
"\n",
"for sentence in processed_syntactic:\n",
" doc = extract_parse_tree(sentence)\n",
" print(\"\\n\" + \"=\"*60 + \"\\n\")"
],
"outputs": [
{
"name": "stdout",
@@ -273,7 +257,24 @@
]
}
],
"execution_count": 4
"source": [
"\n",
"def extract_parse_tree(text):\n",
" doc = nlp(text)\n",
"\n",
" print(f\"Sentence: {text}\")\n",
" print(\"\\nDependenct Parse Tree:\")\n",
" print(\"-\" * 50)\n",
"\n",
" for token in doc:\n",
" print(f\"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}\")\n",
"\n",
" return doc\n",
"\n",
"for sentence in processed_syntactic:\n",
" doc = extract_parse_tree(sentence)\n",
" print(\"\\n\" + \"=\"*60 + \"\\n\")"
]
},
{
"cell_type": "markdown",
@@ -285,6 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e413238c1af12f62",
"metadata": {
"ExecuteTime": {
@@ -292,22 +294,6 @@
"start_time": "2025-11-23T13:56:21.702279Z"
}
},
"source": [
"\n",
"\n",
"def visualize_parse_tree(text):\n",
" doc = nlp(text)\n",
" html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n",
" display(HTML(html))\n",
"\n",
"\n",
"\n",
"for sentence in processed_syntactic:\n",
" print(f\"Sentence: {sentence}\")\n",
" print(\"---\")\n",
" print(f\"Processed Sentence: \" + sentence)\n",
" visualize_parse_tree(sentence)"
],
"outputs": [
{
"name": "stdout",
@@ -320,11 +306,8 @@
},
{
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"4ec65c2317524f6dab93c3963fcc5973-0\" class=\"displacy\" width=\"650\" height=\"237.0\" direction=\"ltr\" style=\"max-width: none; height: 237.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"7988d30657e54e32939db20da966a81e-0\" class=\"displacy\" width=\"650\" height=\"237.0\" direction=\"ltr\" style=\"max-width: none; height: 237.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"147.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">the</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">DET</tspan>\n",
@@ -356,52 +339,52 @@
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-4ec65c2317524f6dab93c3963fcc5973-0-0\" stroke-width=\"2px\" d=\"M70,102.0 C70,52.0 145.0,52.0 145.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-7988d30657e54e32939db20da966a81e-0-0\" stroke-width=\"2px\" d=\"M70,102.0 C70,52.0 145.0,52.0 145.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-4ec65c2317524f6dab93c3963fcc5973-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" <textPath xlink:href=\"#arrow-7988d30657e54e32939db20da966a81e-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,104.0 L62,92.0 78,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-4ec65c2317524f6dab93c3963fcc5973-0-1\" stroke-width=\"2px\" d=\"M170,102.0 C170,52.0 245.0,52.0 245.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-7988d30657e54e32939db20da966a81e-0-1\" stroke-width=\"2px\" d=\"M170,102.0 C170,52.0 245.0,52.0 245.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-4ec65c2317524f6dab93c3963fcc5973-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" <textPath xlink:href=\"#arrow-7988d30657e54e32939db20da966a81e-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,104.0 L162,92.0 178,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-4ec65c2317524f6dab93c3963fcc5973-0-2\" stroke-width=\"2px\" d=\"M270,102.0 C270,52.0 345.0,52.0 345.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-7988d30657e54e32939db20da966a81e-0-2\" stroke-width=\"2px\" d=\"M270,102.0 C270,52.0 345.0,52.0 345.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-4ec65c2317524f6dab93c3963fcc5973-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" <textPath xlink:href=\"#arrow-7988d30657e54e32939db20da966a81e-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M345.0,104.0 L353.0,92.0 337.0,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-4ec65c2317524f6dab93c3963fcc5973-0-3\" stroke-width=\"2px\" d=\"M470,102.0 C470,52.0 545.0,52.0 545.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-7988d30657e54e32939db20da966a81e-0-3\" stroke-width=\"2px\" d=\"M470,102.0 C470,52.0 545.0,52.0 545.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-4ec65c2317524f6dab93c3963fcc5973-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" <textPath xlink:href=\"#arrow-7988d30657e54e32939db20da966a81e-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,104.0 L462,92.0 478,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-4ec65c2317524f6dab93c3963fcc5973-0-4\" stroke-width=\"2px\" d=\"M370,102.0 C370,2.0 550.0,2.0 550.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-7988d30657e54e32939db20da966a81e-0-4\" stroke-width=\"2px\" d=\"M370,102.0 C370,2.0 550.0,2.0 550.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-4ec65c2317524f6dab93c3963fcc5973-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" <textPath xlink:href=\"#arrow-7988d30657e54e32939db20da966a81e-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M550.0,104.0 L558.0,92.0 542.0,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
"output_type": "display_data"
},
{
"name": "stdout",
@@ -414,11 +397,8 @@
},
{
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"239d41ebf05341878765b3474355df7c-0\" class=\"displacy\" width=\"750\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"ede226a392d6400bb9fe1654f4f5b08c-0\" class=\"displacy\" width=\"750\" height=\"287.0\" direction=\"ltr\" style=\"max-width: none; height: 287.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"197.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">on</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">ADP</tspan>\n",
@@ -455,60 +435,60 @@
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-0\" stroke-width=\"2px\" d=\"M70,152.0 C70,2.0 650.0,2.0 650.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-0\" stroke-width=\"2px\" d=\"M70,152.0 C70,2.0 650.0,2.0 650.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,154.0 L62,142.0 78,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-1\" stroke-width=\"2px\" d=\"M170,152.0 C170,102.0 240.0,102.0 240.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-1\" stroke-width=\"2px\" d=\"M170,152.0 C170,102.0 240.0,102.0 240.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,154.0 L162,142.0 178,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-2\" stroke-width=\"2px\" d=\"M70,152.0 C70,52.0 245.0,52.0 245.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-2\" stroke-width=\"2px\" d=\"M70,152.0 C70,52.0 245.0,52.0 245.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M245.0,154.0 L253.0,142.0 237.0,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-3\" stroke-width=\"2px\" d=\"M370,152.0 C370,102.0 440.0,102.0 440.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-3\" stroke-width=\"2px\" d=\"M370,152.0 C370,102.0 440.0,102.0 440.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M370,154.0 L362,142.0 378,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-4\" stroke-width=\"2px\" d=\"M470,152.0 C470,52.0 645.0,52.0 645.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-4\" stroke-width=\"2px\" d=\"M470,152.0 C470,52.0 645.0,52.0 645.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M470,154.0 L462,142.0 478,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-239d41ebf05341878765b3474355df7c-0-5\" stroke-width=\"2px\" d=\"M570,152.0 C570,102.0 640.0,102.0 640.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-ede226a392d6400bb9fe1654f4f5b08c-0-5\" stroke-width=\"2px\" d=\"M570,152.0 C570,102.0 640.0,102.0 640.0,152.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-239d41ebf05341878765b3474355df7c-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
" <textPath xlink:href=\"#arrow-ede226a392d6400bb9fe1654f4f5b08c-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M570,154.0 L562,142.0 578,142.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
"output_type": "display_data"
},
{
"name": "stdout",
@@ -521,11 +501,8 @@
},
{
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"645c3d0343ff46cfb12d7ba372193893-0\" class=\"displacy\" width=\"750\" height=\"237.0\" direction=\"ltr\" style=\"max-width: none; height: 237.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"en\" id=\"e2bf7e2546b1463e841e53291bfb9bb2-0\" class=\"displacy\" width=\"750\" height=\"237.0\" direction=\"ltr\" style=\"max-width: none; height: 237.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"147.0\">\n",
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">a</tspan>\n",
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">DET</tspan>\n",
@@ -562,63 +539,78 @@
"</text>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-0\" stroke-width=\"2px\" d=\"M70,102.0 C70,2.0 350.0,2.0 350.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-0\" stroke-width=\"2px\" d=\"M70,102.0 C70,2.0 350.0,2.0 350.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M70,104.0 L62,92.0 78,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-1\" stroke-width=\"2px\" d=\"M170,102.0 C170,52.0 245.0,52.0 245.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-1\" stroke-width=\"2px\" d=\"M170,102.0 C170,52.0 245.0,52.0 245.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M170,104.0 L162,92.0 178,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-2\" stroke-width=\"2px\" d=\"M270,102.0 C270,52.0 345.0,52.0 345.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-2\" stroke-width=\"2px\" d=\"M270,102.0 C270,52.0 345.0,52.0 345.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M270,104.0 L262,92.0 278,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-3\" stroke-width=\"2px\" d=\"M370,102.0 C370,52.0 445.0,52.0 445.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-3\" stroke-width=\"2px\" d=\"M370,102.0 C370,52.0 445.0,52.0 445.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M445.0,104.0 L453.0,92.0 437.0,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-4\" stroke-width=\"2px\" d=\"M470,102.0 C470,52.0 545.0,52.0 545.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-4\" stroke-width=\"2px\" d=\"M470,102.0 C470,52.0 545.0,52.0 545.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M545.0,104.0 L553.0,92.0 537.0,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"\n",
"<g class=\"displacy-arrow\">\n",
" <path class=\"displacy-arc\" id=\"arrow-645c3d0343ff46cfb12d7ba372193893-0-5\" stroke-width=\"2px\" d=\"M570,102.0 C570,52.0 645.0,52.0 645.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <path class=\"displacy-arc\" id=\"arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-5\" stroke-width=\"2px\" d=\"M570,102.0 C570,52.0 645.0,52.0 645.0,102.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
" <textPath xlink:href=\"#arrow-645c3d0343ff46cfb12d7ba372193893-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
" <textPath xlink:href=\"#arrow-e2bf7e2546b1463e841e53291bfb9bb2-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
" </text>\n",
" <path class=\"displacy-arrowhead\" d=\"M645.0,104.0 L653.0,92.0 637.0,92.0\" fill=\"currentColor\"/>\n",
"</g>\n",
"</svg>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data",
"jetTransient": {
"display_id": null
}
"output_type": "display_data"
}
],
"execution_count": 6
"source": [
"\n",
"\n",
"def visualize_parse_tree(text):\n",
" doc = nlp(text)\n",
" html = displacy.render(doc, style=\"dep\", jupyter=False, options={\"distance\": 100})\n",
" display(HTML(html))\n",
"\n",
"\n",
"\n",
"for sentence in processed_syntactic:\n",
" print(f\"Sentence: {sentence}\")\n",
" print(\"---\")\n",
" print(f\"Processed Sentence: \" + sentence)\n",
" visualize_parse_tree(sentence)"
]
},
{
"cell_type": "code",
@@ -631,9 +623,21 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,

View File

@@ -1,35 +1,55 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1638b7b97e3bd6f",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-22T11:40:21.711998Z",
"start_time": "2025-11-22T11:40:20.129376Z"
}
},
"cell_type": "code",
"outputs": [],
"source": [
"import spacy\n",
"nlp = spacy.load(\"en_core_web_md\") # Medium model"
],
"id": "1638b7b97e3bd6f",
"outputs": [],
"execution_count": 11
"nlp = spacy.load(\"en_core_web_lg\") # Medium model"
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Test word vectors",
"id": "b79941bf4553fd6"
"id": "b79941bf4553fd6",
"metadata": {},
"source": [
"Test word vectors"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "8a3c4314a90086fe",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-22T11:47:39.286432Z",
"start_time": "2025-11-22T11:47:39.271377Z"
}
},
"cell_type": "code",
"outputs": [
{
"ename": "ValueError",
"evalue": "[E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 9\u001b[39m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m word2 \u001b[38;5;129;01min\u001b[39;00m words:\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m word1 != word2:\n\u001b[32m----> \u001b[39m\u001b[32m9\u001b[39m similarity = \u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword1\u001b[49m\u001b[43m]\u001b[49m\u001b[43m.\u001b[49m\u001b[43msimilarity\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnlp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mvocab\u001b[49m\u001b[43m[\u001b[49m\u001b[43mword2\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 10\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword1\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m - \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mword2\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msimilarity\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.3f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:146\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.similarity\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:164\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector_norm.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/code/plagiarism-detector/.venv/lib/python3.13/site-packages/spacy/lexeme.pyx:176\u001b[39m, in \u001b[36mspacy.lexeme.Lexeme.vector.__get__\u001b[39m\u001b[34m()\u001b[39m\n",
"\u001b[31mValueError\u001b[39m: [E010] Word vectors set to length 0. This may be because you don't have a model installed or loaded, or because your model doesn't include word vectors. For more info, see the docs:\nhttps://spacy.io/usage/models"
]
}
],
"source": [
"def test_word_vectors(word):\n",
" print(word, nlp.vocab[word].vector.shape)\n",
@@ -43,62 +63,27 @@
" print(f\"{word1} - {word2}: {similarity:.3f}\")\n",
"\n",
"\n"
],
"id": "8a3c4314a90086fe",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"cat - dog: 1.000\n",
"cat - feline: 0.363\n",
"cat - feral: 0.483\n",
"cat - vehicle: 0.078\n",
"cat - car: 0.193\n",
"dog - cat: 1.000\n",
"dog - feline: 0.363\n",
"dog - feral: 0.483\n",
"dog - vehicle: 0.078\n",
"dog - car: 0.193\n",
"feline - cat: 0.363\n",
"feline - dog: 0.363\n",
"feline - feral: 0.412\n",
"feline - vehicle: 0.180\n",
"feline - car: 0.050\n",
"feral - cat: 0.483\n",
"feral - dog: 0.483\n",
"feral - feline: 0.412\n",
"feral - vehicle: 0.175\n",
"feral - car: 0.161\n",
"vehicle - cat: 0.078\n",
"vehicle - dog: 0.078\n",
"vehicle - feline: 0.180\n",
"vehicle - feral: 0.175\n",
"vehicle - car: 0.205\n",
"car - cat: 0.193\n",
"car - dog: 0.193\n",
"car - feline: 0.050\n",
"car - feral: 0.161\n",
"car - vehicle: 0.205\n"
]
}
],
"execution_count": 15
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Simple averaging",
"id": "8f32b5695f554268"
"id": "8f32b5695f554268",
"metadata": {},
"source": [
"Simple averaging"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "68a6757447e4a1c7",
"metadata": {
"ExecuteTime": {
"end_time": "2025-11-18T23:45:03.085563Z",
"start_time": "2025-11-18T23:45:03.082190Z"
}
},
"cell_type": "code",
"outputs": [],
"source": [
"def sentence_similarity_avg(sent1, sent2):\n",
" doc1 = nlp(sent1)\n",
@@ -118,35 +103,46 @@
" #cosine similarity\n",
" from sklearn.metrics.pairwise import cosine_similarity\n",
" return cosine_similarity([avg1], [avg2])[0][0]\n"
],
"id": "68a6757447e4a1c7",
"outputs": [],
"execution_count": 3
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "SIF - Smooth Inverse Similarity",
"id": "a9c3aa050f5bc0fe"
"id": "a9c3aa050f5bc0fe",
"metadata": {},
"source": [
"SIF - Smooth Inverse Similarity"
]
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"id": "c100956f89d9b581",
"metadata": {},
"outputs": [],
"source": [
"def sentence_similarity_sif(sent1, sent2):\n",
" doc1 = nlp(sent1)\n",
" doc2 = nlp(sent2)"
],
"id": "c100956f89d9b581"
]
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": ".venv",
"language": "python",
"display_name": "Python 3 (ipykernel)"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,