Simple sentence parser and dataset tester program

This commit is contained in:
Henry Dowd
2025-10-08 14:55:08 +01:00
parent ed7046a8c0
commit 188a8e5852
11 changed files with 92 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
venv
.vscode

0
__init__.py Normal file
View File

Binary file not shown.

14
main.py Normal file
View File

@@ -0,0 +1,14 @@
from tools.parser import parse_sentence, extract_dependency_relationships
def main():
    """Parse one demo sentence and print its dependency arcs."""
    text = "The quick brown fox jumps over the lazy dog."
    parsed = parse_sentence(text)
    print("\nDependency relationships:")
    # Each relation dict carries the dependent word, arc label, and head word.
    for rel in extract_dependency_relationships(parsed):
        print(f"{rel['word']} --{rel['dep_type']}--> {rel['head']}")


if __name__ == "__main__":
    main()

0
testing/__init__.py Normal file
View File

Binary file not shown.

View File

@@ -0,0 +1,41 @@
import spacy
from tools import parser
# Load spaCy and dataset
# Shared English pipeline used by process_sentence_pair below.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): `load_dataset` is not defined in the visible portion of
# tools/parser.py — this looks like it was meant to be
# `from datasets import load_dataset` (Hugging Face); confirm before running.
dataset = parser.load_dataset("glue", "mrpc")
def process_sentence_pair(sentence1, sentence2):
    """Parse both sentences and extract their dependency structures.

    Returns a dict holding the raw sentences, one dependency list per
    sentence, and the parsed spaCy docs themselves.
    """
    # Run the shared pipeline over each sentence, then pull out the arcs.
    docs = [nlp(text) for text in (sentence1, sentence2)]
    graphs = [parser.extract_dependency_relationships(d) for d in docs]
    return {
        'sentence1': sentence1,
        'sentence2': sentence2,
        'dependencies1': graphs[0],
        'dependencies2': graphs[1],
        'doc1': docs[0],
        'doc2': docs[1],
    }
# Walk the first few MRPC training pairs and show their parses.
print("Processing MRPC examples...")
for idx in (0, 1, 2):
    row = dataset['train'][idx]
    pair = process_sentence_pair(row['sentence1'], row['sentence2'])
    print(f"\nExample {idx+1}:")
    print(f"Sentence 1: {pair['sentence1']}")
    print(f"Sentence 2: {pair['sentence2']}")
    print(f"Label: {row['label']} (1=paraphrase, 0=not paraphrase)")
    print("\nDependencies for Sentence 1:")
    # Only the first five arcs, to keep the output readable.
    for item in pair['dependencies1'][:5]:
        print(f"  {item['word']} --{item['dep_type']}--> {item['head']}")

0
tools/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

35
tools/parser.py Normal file
View File

@@ -0,0 +1,35 @@
import spacy
# English model
# Module-level pipeline shared by parse_sentence below.
# NOTE(review): requires the `en_core_web_sm` model to be downloaded
# (`python -m spacy download en_core_web_sm`) — confirm setup docs mention it.
nlp = spacy.load("en_core_web_sm")
# Parse a single sentence
def parse_sentence(sentence):
    """Run the spaCy pipeline on *sentence*, printing a per-token breakdown.

    Returns the parsed spaCy Doc so callers can inspect it further.
    """
    parsed = nlp(sentence)
    print("Token-by-token analysis:")
    # One aligned line per token: surface form, arc label, head, coarse POS.
    for tok in parsed:
        print(f"Text: {tok.text:<12} Dep: {tok.dep_:<10} Head: {tok.head.text:<10} POS: {tok.pos_:<8}")
    return parsed
def extract_dependency_relationships(doc):
    """Extract dependency relationships for graph representation.

    Punctuation tokens are dropped; every remaining token yields one dict
    describing the arc from the token to its syntactic head.
    """
    return [
        {
            'word': tok.text,
            'lemma': tok.lemma_,
            'dep_type': tok.dep_,
            'head': tok.head.text,
            'head_lemma': tok.head.lemma_,
            'pos': tok.pos_,
        }
        for tok in doc
        if not tok.is_punct  # skip punctuation
    ]