# Source listing metadata: 42 lines, 1.3 KiB, Python
import spacy
|
|
from datasets import load_dataset
|
|
from tools import parser
|
|
|
|
# Shared resources, created once at import time:
#   nlp     - the spaCy pipeline loaded from the "en_core_web_sm" package
#   dataset - the "mrpc" configuration of the "glue" dataset
nlp = spacy.load(name="en_core_web_sm")
dataset = load_dataset(path="glue", name="mrpc")
|
|
|
|
def process_sentence_pair(sentence1, sentence2):
    """Parse a pair of sentences and gather their dependency relations.

    Both sentences are run through the module-level ``nlp`` pipeline, and
    each resulting doc is handed to
    ``parser.extract_dependency_relationships``.

    Args:
        sentence1: Text of the first sentence.
        sentence2: Text of the second sentence.

    Returns:
        A dict with the original sentences (``sentence1``, ``sentence2``),
        the extracted relations (``dependencies1``, ``dependencies2``) and
        the parsed docs themselves (``doc1``, ``doc2``).
    """
    # Parse first, then extract, keeping the same call order for both steps.
    first_doc, second_doc = (nlp(text) for text in (sentence1, sentence2))
    first_deps, second_deps = map(
        parser.extract_dependency_relationships,
        (first_doc, second_doc),
    )

    return dict(
        sentence1=sentence1,
        sentence2=sentence2,
        dependencies1=first_deps,
        dependencies2=second_deps,
        doc1=first_doc,
        doc2=second_doc,
    )
|
|
|
|
# Demo run: parse a handful of training pairs and print what was found.
print("Processing MRPC examples...")
train_split = dataset['train']
for i in range(5):  # only the first 5 examples
    example = train_split[i]
    result = process_sentence_pair(example['sentence1'], example['sentence2'])

    # Header for this pair: both sentences plus the dataset label.
    print(f"\nExample {i + 1}:")
    print(f"Sentence 1: {result['sentence1']}")
    print(f"Sentence 2: {result['sentence2']}")
    print(f"Label: {example['label']} (1=paraphrase, 0=not paraphrase)")

    print("\nDependencies for Sentence 1:")
    # Cap the listing at 55 entries, which is likely all of them.
    shown = result['dependencies1'][:55]
    for dep in shown:
        # Each entry is indexed by 'word', 'dep_type' and 'head'.
        print(f" {dep['word']} --{dep['dep_type']}--> {dep['head']}")