diff --git a/basic-testing/parse_file.py b/basic-testing/parse_file.py new file mode 100644 index 0000000..5113254 --- /dev/null +++ b/basic-testing/parse_file.py @@ -0,0 +1,6 @@ +import pandas as pd + +def load_msr_data(file_path): + """Load the MSR Paraphrase Corpus from a TSV file.""" + df = pd.read_csv(file_path, sep='\t', quoting=3) + return df \ No newline at end of file diff --git a/basic-testing/parse_tree.py b/basic-testing/parse_tree.py new file mode 100644 index 0000000..60285ea --- /dev/null +++ b/basic-testing/parse_tree.py @@ -0,0 +1,29 @@ +import spacy +from spacy import displacy + +# Load the model +nlp = spacy.load("en_core_web_sm") + +def extract_parse_tree(text): + """Extract basic parse tree information""" + doc = nlp(text) + + print(f"Sentence: {text}") + print("\nDependency Parse Tree:") + print("-" * 50) + + for token in doc: + print(f"{token.text:<12} {token.dep_:<12} {token.head.text:<12} {[child.text for child in token.children]}") + + return doc + +# Test with some sentences +test_sentences = [ + "The cat sat on the mat.", + "A quick brown fox jumps over the lazy dog.", + "She gave him the book yesterday." 
+] + +for sentence in test_sentences: + doc = extract_parse_tree(sentence) + print("\n" + "="*60 + "\n") \ No newline at end of file diff --git a/data/processed/msr_test.pkl b/data/processed/msr_test.pkl new file mode 100644 index 0000000..4ab7f43 Binary files /dev/null and b/data/processed/msr_test.pkl differ diff --git a/data/processed/msr_train.pkl b/data/processed/msr_train.pkl new file mode 100644 index 0000000..b86de6c Binary files /dev/null and b/data/processed/msr_train.pkl differ diff --git a/data_preprocessing/msr_data_to_pickle.py b/data_preprocessing/msr_data_to_pickle.py new file mode 100644 index 0000000..f6ccbde --- /dev/null +++ b/data_preprocessing/msr_data_to_pickle.py @@ -0,0 +1,53 @@ +import pandas as pd +import os + +raw_data_path = "./data/raw/" +processed_data_path = "./data/processed/" + +def load_msr_data(file_path): + """Load the MSR Paraphrase Corpus from a TSV file.""" + df = pd.read_csv(file_path, sep='\t', quoting=3) # quoting=3 for ignoring quotes + + print(f"Loaded {len(df)} sentence pairs") + #print(f"Positive examples (paraphrases): {df['quality'].sum()}") + #print(f"Negative examples: {len(df) - df['quality'].sum()}") + + return df + + +def save_to_pickle(df, pickle_path): + """Save the DataFrame to a pickle file.""" + df.to_pickle(pickle_path) + print(f"DataFrame saved to {pickle_path}") + + +def load_and_save_data(): + """Load paraphrase data from user input and save as pickle""" + print("Enter current relative path to MSR Corpus\n") + relative_path = input("./ : ").strip() + #full_path = os.path.join(raw_data_path, relative_path) + + try: + df = load_msr_data(relative_path) + except Exception as e: + print(f"❌ Error loading data: {e}") + return None + + pkl_save_path = input("Enter relative path to save pickle: ").strip() + if not os.path.isdir(pkl_save_path): + print(f"❌ Directory does not exist: ./{pkl_save_path}") + return None + + pkl_filename = input("Enter pickle filename: ").strip() + ".pkl" + full_pkl_path = 
os.path.join(pkl_save_path, pkl_filename) + + try: + save_to_pickle(df, full_pkl_path) + except Exception as e: + print(f"❌ Error saving pickle: {e}") + return None + + print("✅ Data loading and saving completed successfully.") + print(f"Pickle saved at: {full_pkl_path}") + +load_and_save_data() \ No newline at end of file diff --git a/dataset_testing.py b/dataset_testing.py deleted file mode 100644 index d4ee313..0000000 --- a/dataset_testing.py +++ /dev/null @@ -1,42 +0,0 @@ -import spacy -from datasets import load_dataset -from tools import parser - -# Load spaCy and dataset -nlp = spacy.load("en_core_web_sm") -dataset = load_dataset("glue", "mrpc") - -def process_sentence_pair(sentence1, sentence2): - """Parse both sentences and extract their dependency structures""" - - # Parse both sentences - doc1 = nlp(sentence1) - doc2 = nlp(sentence2) - - # Extract dependency graphs - deps1 = parser.extract_dependency_relationships(doc1) - deps2 = parser.extract_dependency_relationships(doc2) - - return { - 'sentence1': sentence1, - 'sentence2': sentence2, - 'dependencies1': deps1, - 'dependencies2': deps2, - 'doc1': doc1, - 'doc2': doc2 - } - -# Process a few examples from the dataset -print("Processing MRPC examples...") -for i in range(5): # Just do first 5 examples - example = dataset['train'][i] - result = process_sentence_pair(example['sentence1'], example['sentence2']) - - print(f"\nExample {i+1}:") - print(f"Sentence 1: {result['sentence1']}") - print(f"Sentence 2: {result['sentence2']}") - print(f"Label: {example['label']} (1=paraphrase, 0=not paraphrase)") - - print(f"\nDependencies for Sentence 1:") - for dep in result['dependencies1'][:55]: # Show first 55 dependencies (Likely All) - print(f" {dep['word']} --{dep['dep_type']}--> {dep['head']}") \ No newline at end of file diff --git a/requirments.txt b/requirments.txt index e69de29..0005b3d 100644 --- a/requirments.txt +++ b/requirments.txt @@ -0,0 +1,8 @@ +datasets +huggingface-hub +pandas +numpy 
+scikit-learn +spacy +matplotlib +seaborn \ No newline at end of file diff --git a/tools/__init__.py b/tools/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tools/__pycache__/__init__.cpython-313.pyc b/tools/__pycache__/__init__.cpython-313.pyc deleted file mode 100644 index 443a12c..0000000 Binary files a/tools/__pycache__/__init__.cpython-313.pyc and /dev/null differ diff --git a/tools/__pycache__/parser.cpython-313.pyc b/tools/__pycache__/parser.cpython-313.pyc deleted file mode 100644 index b58fcc9..0000000 Binary files a/tools/__pycache__/parser.cpython-313.pyc and /dev/null differ diff --git a/tools/parser.py b/tools/parser.py deleted file mode 100644 index 63fbfe8..0000000 --- a/tools/parser.py +++ /dev/null @@ -1,35 +0,0 @@ -import spacy - -# English model -nlp = spacy.load("en_core_web_sm") - -# Parse a single sentence -def parse_sentence(sentence): - doc = nlp(sentence) - - print("Token-by-token analysis:") - for token in doc: - print(f"Text: {token.text:<12} Dep: {token.dep_:<10} Head: {token.head.text:<10} POS: {token.pos_:<8}") - - return doc - -def extract_dependency_relationships(doc): - """Extract dependency relationships for graph representation""" - dependencies = [] - - for token in doc: - # Skip punctuation - if token.is_punct: - continue - - dependency = { - 'word': token.text, - 'lemma': token.lemma_, - 'dep_type': token.dep_, - 'head': token.head.text, - 'head_lemma': token.head.lemma_, - 'pos': token.pos_ - } - dependencies.append(dependency) - - return dependencies