import os
import io
import json
from torchnlp.download import download_file_maybe_extract
def snli_dataset(directory='data/',
                 train=False,
                 dev=False,
                 test=False,
                 train_filename='snli_1.0_train.jsonl',
                 dev_filename='snli_1.0_dev.jsonl',
                 test_filename='snli_1.0_test.jsonl',
                 extracted_name='snli_1.0',
                 check_files=['snli_1.0/snli_1.0_train.jsonl'],
                 url='http://nlp.stanford.edu/projects/snli/snli_1.0.zip'):
    """
    Load the Stanford Natural Language Inference (SNLI) dataset.

    The SNLI corpus (version 1.0) is a collection of 570k human-written English sentence pairs
    manually labeled for balanced classification with the labels entailment, contradiction, and
    neutral, supporting the task of natural language inference (NLI), also known as recognizing
    textual entailment (RTE). We aim for it to serve both as a benchmark for evaluating
    representational systems for text, especially including those induced by representation
    learning methods, as well as a resource for developing NLP models of any kind.

    **Reference:** https://nlp.stanford.edu/projects/snli/

    **Citation:**
    Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large
    annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference
    on Empirical Methods in Natural Language Processing (EMNLP).

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        test (bool, optional): If to load the test split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        test_filename (str, optional): The filename of the test split.
        extracted_name (str, optional): Name of the extracted dataset directory.
        check_files (list of str, optional): Check if these files exist, then this download was
            successful.
        url (str, optional): URL of the dataset ``zip`` file.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`:
        Returns between one and all dataset splits (train, dev and test) depending on if their
        respective boolean argument is ``True``.

    Example:
        >>> from torchnlp.datasets import snli_dataset  # doctest: +SKIP
        >>> train = snli_dataset(train=True)  # doctest: +SKIP
        >>> train[0]  # doctest: +SKIP
        {
          'premise': 'Kids are on a amusement ride.',
          'hypothesis': 'A car is broke down on the side of the road.',
          'label': 'contradiction',
          'premise_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
          'hypothesis_transitions': ['shift', 'shift', 'shift', 'shift', 'shift', 'shift', ...],
        }
    """
    # Fetch the archive (no-op if `check_files` already exist) and extract it.
    download_file_maybe_extract(url=url, directory=directory, check_files=check_files)

    def get_transitions(parse):
        # Convert a binary parse (e.g. "( ( Kids are ) ... )") into SPINN-style
        # shift/reduce transitions: ')' closes a constituent -> 'reduce',
        # tokens -> 'shift', '(' carries no transition.
        return ['reduce' if token == ')' else 'shift' for token in parse if token != '(']

    ret = []
    # Only the splits whose boolean flag is True are loaded, in (train, dev, test) order.
    splits = [(train, train_filename), (dev, dev_filename), (test, test_filename)]
    splits = [filename for (requested, filename) in splits if requested]
    for filename in splits:
        full_path = os.path.join(directory, extracted_name, filename)
        examples = []
        with io.open(full_path, encoding='utf-8') as f:
            # Each line of the .jsonl file is one JSON-encoded example.
            for line in f:
                line = line.strip()
                line = json.loads(line)
                examples.append({
                    'premise': line['sentence1'],
                    'hypothesis': line['sentence2'],
                    'label': line['gold_label'],
                    'premise_transitions': get_transitions(line['sentence1_binary_parse']),
                    'hypothesis_transitions': get_transitions(line['sentence2_binary_parse'])
                })
        ret.append(examples)

    # One split requested -> return it directly; several -> return them as a tuple.
    if len(ret) == 1:
        return ret[0]
    else:
        return tuple(ret)