# Source code for torchnlp.datasets.squad

import os
import json
from torchnlp.download import download_file_maybe_extract


def squad_dataset(directory='data/',
                  train=False,
                  dev=False,
                  train_filename='train-v2.0.json',
                  dev_filename='dev-v2.0.json',
                  check_files_train=['train-v2.0.json'],
                  check_files_dev=['dev-v2.0.json'],
                  url_train='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
                  url_dev='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'):
    """ Load the Stanford Question Answering Dataset (SQuAD) dataset.

    Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of
    questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every
    question is a segment of text, or span, from the corresponding reading passage, or the question
    might be unanswerable. SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
    unanswerable questions written adversarially by crowdworkers to look similar to answerable
    ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also
    determine when no answer is supported by the paragraph and abstain from answering.

    **Reference:** https://rajpurkar.github.io/SQuAD-explorer/

    **Citation:**
    Rajpurkar, P., Jia, R. and Liang, P., 2018.
    Know what you don't know: Unanswerable questions for SQuAD.
    arXiv preprint arXiv:1806.03822.

    Args:
        directory (str, optional): Directory to cache the dataset.
        train (bool, optional): If to load the training split of the dataset.
        dev (bool, optional): If to load the development split of the dataset.
        train_filename (str, optional): The filename of the training split.
        dev_filename (str, optional): The filename of the development split.
        check_files_train (list, optional): All train filenames. (Note: list defaults are kept
            for backward compatibility; they are never mutated by this function.)
        check_files_dev (list, optional): All development filenames.
        url_train (str, optional): URL of the train dataset `.json` file.
        url_dev (str, optional): URL of the dev dataset `.json` file.

    Returns:
        :class:`tuple` of :class:`iterable` or :class:`iterable`: Returns between one and all
        dataset splits (train and dev) depending on if their respective boolean argument is
        ``True``.

    Example:
        >>> from torchnlp.datasets import squad_dataset  # doctest: +SKIP
        >>> train = squad_dataset(train=True)  # doctest: +SKIP
        >>> train[0]['paragraphs'][0]['qas'][0]['question']  # doctest: +SKIP
        'When did Beyonce start becoming popular?'
        >>> train[0]['paragraphs'][0]['qas'][0]['answers'][0]  # doctest: +SKIP
        {'text': 'in the late 1990s', 'answer_start': 269}
    """
    # Each entry: (requested?, file to load, download URL, files whose presence
    # means the split is already cached).
    splits = [
        (train, train_filename, url_train, check_files_train),
        (dev, dev_filename, url_dev, check_files_dev),
    ]
    # Only the splits the caller actually asked for — the original downloaded
    # both train and dev unconditionally, wasting bandwidth and time.
    splits = [s for s in splits if s[0]]

    ret = []
    for _, filename, url, check_files in splits:
        # Skip the download entirely when every expected file is already on disk.
        if not all(os.path.isfile(os.path.join(directory, f)) for f in check_files):
            download_file_maybe_extract(url=url, directory=directory, check_files=check_files)
        full_path = os.path.join(directory, filename)
        # SQuAD JSON files are UTF-8; be explicit so the platform default
        # encoding cannot corrupt the read. Top-level 'data' key holds the
        # list of articles.
        with open(full_path, 'r', encoding='utf-8') as json_file:
            ret.append(json.load(json_file)['data'])

    # Single requested split is returned bare; multiple splits as a tuple.
    if len(ret) == 1:
        return ret[0]
    return tuple(ret)