import os
import io
from torchnlp.download import download_file_maybe_extract
def get_label_str(label, fine_grained=False):
pre = 'very ' if fine_grained else ''
return {
'0': pre + 'negative',
'1': 'negative',
'2': 'neutral',
'3': 'positive',
'4': pre + 'positive',
None: None
}[label]
def parse_tree(data, subtrees=False, fine_grained=False):
# https://github.com/pytorch/text/blob/6476392a801f51794c90378dd23489578896c6f2/torchtext/data/example.py#L56
try:
from nltk.tree import Tree
except ImportError:
print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
raise
tree = Tree.fromstring(data)
if subtrees:
return [{
'text': ' '.join(t.leaves()),
'label': get_label_str(t.label(), fine_grained=fine_grained)
} for t in tree.subtrees()]
return {
'text': ' '.join(tree.leaves()),
'label': get_label_str(tree.label(), fine_grained=fine_grained)
}
[docs]def smt_dataset(directory='data/',
train=False,
dev=False,
test=False,
train_filename='train.txt',
dev_filename='dev.txt',
test_filename='test.txt',
extracted_name='trees',
check_files=['trees/train.txt'],
url='http://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip',
fine_grained=False,
subtrees=False):
"""
Load the Stanford Sentiment Treebank dataset.
Semantic word spaces have been very useful but cannot express the meaning of longer phrases in
a principled way. Further progress towards understanding compositionality in tasks such as
sentiment detection requires richer supervised training and evaluation resources and more
powerful models of composition. To remedy this, we introduce a Sentiment Treebank. It includes
fine grained sentiment labels for 215,154 phrases in the parse trees of 11,855 sentences and
presents new challenges for sentiment compositionality.
**Reference**:
https://nlp.stanford.edu/sentiment/index.html
**Citation:**
Richard Socher, Alex Perelygin, Jean Y. Wu, Jason Chuang, Christopher D. Manning,
Andrew Y. Ng and Christopher Potts. Recursive Deep Models for Semantic Compositionality Over a
Sentiment Treebank
Args:
directory (str, optional): Directory to cache the dataset.
train (bool, optional): If to load the training split of the dataset.
dev (bool, optional): If to load the development split of the dataset.
test (bool, optional): If to load the test split of the dataset.
train_filename (str, optional): The filename of the training split.
dev_filename (str, optional): The filename of the development split.
test_filename (str, optional): The filename of the test split.
extracted_name (str, optional): Name of the extracted dataset directory.
check_files (str, optional): Check if these files exist, then this download was successful.
url (str, optional): URL of the dataset `tar.gz` file.
subtrees (bool, optional): Whether to include sentiment-tagged subphrases in addition to
complete examples.
fine_grained (bool, optional): Whether to use 5-class instead of 3-class labeling.
Returns:
:class:`tuple` of :class:`iterable` or :class:`iterable`:
Returns between one and all dataset splits (train, dev and test) depending on if their
respective boolean argument is ``True``.
Example:
>>> from torchnlp.datasets import smt_dataset # doctest: +SKIP
>>> train = smt_dataset(train=True) # doctest: +SKIP
>>> train[5] # doctest: +SKIP
{
'text': "Whether or not you 're enlightened by any of Derrida 's lectures on ...",
'label': 'positive'
}
"""
download_file_maybe_extract(url=url, directory=directory, check_files=check_files)
ret = []
splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
splits = [f for (requested, f) in splits if requested]
for filename in splits:
full_path = os.path.join(directory, extracted_name, filename)
examples = []
with io.open(full_path, encoding='utf-8') as f:
for line in f:
line = line.strip()
if subtrees:
examples.extend(parse_tree(line, subtrees=subtrees, fine_grained=fine_grained))
else:
examples.append(parse_tree(line, subtrees=subtrees, fine_grained=fine_grained))
ret.append(examples)
if len(ret) == 1:
return ret[0]
else:
return tuple(ret)