diff --git a/notebooks/01_data_exploration.ipynb b/notebooks/01_data_exploration.ipynb index 07cffdd..229a62f 100644 --- a/notebooks/01_data_exploration.ipynb +++ b/notebooks/01_data_exploration.ipynb @@ -9,12 +9,13 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-18T23:01:17.888318Z", - "start_time": "2025-11-18T23:01:16.494987Z" + "end_time": "2025-11-19T00:00:02.012627Z", + "start_time": "2025-11-19T00:00:00.160731Z" } }, "cell_type": "code", "source": [ + "import import_ipynb\n", "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\") # Can swap for large model if required\n", @@ -50,7 +51,7 @@ ] } ], - "execution_count": 1 + "execution_count": 2 }, { "metadata": {}, diff --git a/notebooks/02_baseline_experiments.ipynb b/notebooks/02_baseline_experiments.ipynb index 34f5390..126ec8a 100644 --- a/notebooks/02_baseline_experiments.ipynb +++ b/notebooks/02_baseline_experiments.ipynb @@ -3,12 +3,15 @@ { "metadata": { "ExecuteTime": { - "end_time": "2025-11-18T23:15:35.056834Z", - "start_time": "2025-11-18T23:15:35.051218Z" + "end_time": "2025-11-19T00:00:04.962487Z", + "start_time": "2025-11-19T00:00:04.958995Z" } }, "cell_type": "code", "source": [ + "import import_ipynb\n", + "from notebooks.01_data_exploration import *\n", + "\n", "def jaccard_similarity(sent1, sent2):\n", " # make lowercase and split into words\n", " words1 = set(sent1.lower().split())\n", @@ -30,16 +33,15 @@ "id": "e60d024e969254a", "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "'The cat sat on the mat.' vs 'The cat sat on the mat.': 1.000\n", - "'The cat sat on the mat.' vs 'On the mat sat the cat.': 0.429\n", - "'The cat sat on the mat.' 
vs 'The dog ran in the park': 0.111\n" + "ename": "SyntaxError", + "evalue": "invalid decimal literal (2501033926.py, line 2)", + "output_type": "error", + "traceback": [ + " \u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[8]\u001B[39m\u001B[32m, line 2\u001B[39m\n\u001B[31m \u001B[39m\u001B[31mfrom notebooks.01_data_exploration import *\u001B[39m\n ^\n\u001B[31mSyntaxError\u001B[39m\u001B[31m:\u001B[39m invalid decimal literal\n" ] } ], - "execution_count": 7 + "execution_count": 8 } ], "metadata": { diff --git a/notebooks/03_semantic_methods.ipynb b/notebooks/03_semantic_methods.ipynb index 08f9bd0..09d9e85 100644 --- a/notebooks/03_semantic_methods.ipynb +++ b/notebooks/03_semantic_methods.ipynb @@ -2,14 +2,112 @@ "cells": [ { "metadata": {}, + "cell_type": "markdown", + "source": "Test word vectors", + "id": "b79941bf4553fd6" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-11-18T23:28:55.563335Z", + "start_time": "2025-11-18T23:28:53.763429Z" + } + }, "cell_type": "code", + "source": [ + "import import_ipynb\n", + "from notebooks.02_baseline_experiments.ipynb import *\n", + "\n", + "import spacy\n", + "nlp = spacy.load(\"en_core_web_md\")\n", + "\n", + "words = [\"cat\", \"dog\", \"feline\", \"vehicle\", \"car\"]\n", + "# Test work similarities\n", + "for word1 in words:\n", + " for word2 in words:\n", + " if word1 != word2:\n", + " similarity = nlp.vocab[word1].similarity(nlp.vocab[word2])\n", + " print(f\"{word1} - {word2}: {similarity:.3f}\")\n", + "\n", + "\n" + ], + "id": "8a3c4314a90086fe", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat - dog: 1.000\n", + "cat - feline: 0.363\n", + "cat - vehicle: 0.078\n", + "cat - car: 0.193\n", + "dog - cat: 1.000\n", + "dog - feline: 0.363\n", + "dog - vehicle: 0.078\n", + "dog - car: 0.193\n", + "feline - cat: 0.363\n", + "feline - dog: 0.363\n", + "feline - vehicle: 0.180\n", + "feline - car: 0.050\n", + "vehicle - cat: 0.078\n", + "vehicle - dog: 
0.078\n",
+        "vehicle - feline: 0.180\n",
+        "vehicle - car: 0.205\n",
+        "car - cat: 0.193\n",
+        "car - dog: 0.193\n",
+        "car - feline: 0.050\n",
+        "car - vehicle: 0.205\n"
+      ]
+    }
+   ],
+   "execution_count": 1
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "Simple averaging",
+   "id": "8f32b5695f554268"
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-11-18T23:45:03.085563Z",
+     "start_time": "2025-11-18T23:45:03.082190Z"
+    }
+   },
+   "cell_type": "code",
+   "source": [
+    "def sentence_similarity_avg(sent1, sent2):\n",
+    "    doc1 = nlp(sent1)\n",
+    "    doc2 = nlp(sent2)\n",
+    "\n",
+    "    # Vectors for each word, filter out words without vectors (medium model)\n",
+    "    vecs1 = [token.vector for token in doc1 if token.has_vector]\n",
+    "    vecs2 = [token.vector for token in doc2 if token.has_vector]\n",
+    "\n",
+    "    if not vecs1 or not vecs2:\n",
+    "        return 0.0\n",
+    "\n",
+    "    # Average vectors\n",
+    "    avg1 = sum(vecs1) / len(vecs1)\n",
+    "    avg2 = sum(vecs2) / len(vecs2)\n",
+    "\n",
+    "    #cosine similarity\n",
+    "    from sklearn.metrics.pairwise import cosine_similarity\n",
+    "    return cosine_similarity([avg1], [avg2])[0][0]\n"
+   ],
+   "id": "68a6757447e4a1c7",
    "outputs": [],
-   "execution_count": null,
-   "source": "",
-   "id": "8a3c4314a90086fe"
+   "execution_count": 3
   }
  ],
- "metadata": {},
+ "metadata": {
+  "kernelspec": {
+   "name": "python3",
+   "language": "python",
+   "display_name": "Python 3 (ipykernel)"
+  }
+ },
  "nbformat": 4,
  "nbformat_minor": 5
 }
diff --git a/notebooks/04_fusion_model.ipynb b/notebooks/04_fusion_model.ipynb
index e69de29..ef126da 100644
--- a/notebooks/04_fusion_model.ipynb
+++ b/notebooks/04_fusion_model.ipynb
@@ -0,0 +1,48 @@
+{
+ "cells": [
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "import import_ipynb, importlib\n",
+    "globals().update(vars(importlib.import_module(\"notebooks.03_semantic_methods\")))\n",
+    "\n",
+    "def extract_all_features(sentence_pairs):\n",
+    "    features = []\n",
+    "    for sent1, sent2 in sentence_pairs:\n",
+    "        feature_vector = [\n",
+    "            jaccard_similarity(sent1, sent2),\n",
+    "            sentence_similarity_avg(sent1, sent2),\n",
+    "            sentence_similarity_sif(sent1, sent2),\n",
+    "            syntactic_similarity(sent1, sent2)\n",
+    "        ]\n        features.append(feature_vector)\n    return features"
+   ],
+   "id": "1c45d83192facfc6"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": [
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "labeled_pairs = []\n",
+    "\n",
+    "X = extract_all_features(labeled_pairs)\n",
+    "y = [0, 1, 0, 1]  # Labels for pairs (placeholder until real data is labeled)\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
+    "model = LogisticRegression()\n",
+    "model.fit(X_train, y_train)\n"
+   ],
+   "id": "9665682bd5a7951e"
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}