# Source code for torchnlp.word_to_vector.bpemb

from torchnlp.word_to_vector.pretrained_word_vectors import _PretrainedWordVectors

# All 275 language codes for which pre-trained BPEmb embeddings are available.
# ``BPEmb.__init__`` rejects any ``language`` argument that is not in this list,
# and prints the list verbatim in the resulting error message.
SUPPORTED_LANGUAGES = [
    'ab', 'ace', 'ady', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arc', 'arz', 'as', 'ast',
    'atj', 'av', 'ay', 'az', 'azb', 'ba', 'bar', 'bcl', 'be', 'bg', 'bi', 'bjn', 'bm', 'bn', 'bo',
    'bpy', 'br', 'bs', 'bug', 'bxr', 'ca', 'cdo', 'ce', 'ceb', 'ch', 'chr', 'chy', 'ckb', 'co',
    'cr', 'crh', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz',
    'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'frp',
    'frr', 'fur', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'glk', 'gn', 'gom', 'got', 'gu', 'gv', 'ha',
    'hak', 'haw', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ik',
    'ilo', 'io', 'is', 'it', 'iu', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kg',
    'ki', 'kk', 'kl', 'km', 'kn', 'ko', 'koi', 'krc', 'ks', 'ksh', 'ku', 'kv', 'kw', 'ky', 'la',
    'lad', 'lb', 'lbe', 'lez', 'lg', 'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'ltg', 'lv',
    'mai', 'mdf', 'mg', 'mh', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl',
    'my', 'myv', 'mzn', 'na', 'nap', 'nds', 'ne', 'new', 'ng', 'nl', 'nn', 'no', 'nov', 'nrm',
    'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pag', 'pam', 'pap', 'pcd', 'pdc',
    'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'qu', 'rm', 'rmy', 'rn', 'ro', 'ru',
    'rue', 'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm',
    'sn', 'so', 'sq', 'sr', 'srn', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te',
    'tet', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tpi', 'tr', 'ts', 'tt', 'tum', 'tw', 'ty',
    'tyv', 'udm', 'ug', 'uk', 'ur', 'uz', 've', 'vec', 'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'wo',
    'wuu', 'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zh', 'zu'
]

# All supported vector dimensionalities for which embeddings were trained.
# ``BPEmb.__init__`` raises ``ValueError`` for a ``dim`` outside this list.
SUPPORTED_DIMS = [25, 50, 100, 200, 300]

# All supported numbers of merge operations for which embeddings were trained.
# ``BPEmb.__init__`` raises ``ValueError`` for a ``merge_ops`` outside this list.
SUPPORTED_MERGE_OPS = [1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000]

class BPEmb(_PretrainedWordVectors):
    """
    Byte-Pair Encoding (BPE) embeddings trained on Wikipedia for 275 languages

    A collection of pre-trained subword unit embeddings in 275 languages, based on Byte-Pair
    Encoding (BPE). In an evaluation using fine-grained entity typing as testbed, BPEmb performs
    competitively, and for some languages better than alternative subword approaches, while
    requiring vastly fewer resources and no tokenization.

    References:
        * Heinzerling and Strube, "BPEmb: Tokenization-free Pre-trained Subword Embeddings in
          275 Languages"
        * https://github.com/bheinzerling/bpemb

    Args:
        language (str, optional): Language of the corpus on which the embeddings have been
            trained; must be one of ``SUPPORTED_LANGUAGES``
        dim (int, optional): Dimensionality of the embeddings; must be one of
            ``SUPPORTED_DIMS``
        merge_ops (int, optional): Number of merge operations used by the tokenizer; must be
            one of ``SUPPORTED_MERGE_OPS``
        **kwargs: Forwarded unchanged to the ``_PretrainedWordVectors`` constructor

    Raises:
        ValueError: If ``language``, ``dim`` or ``merge_ops`` is not a supported value.

    Example:
        >>> from torchnlp.word_to_vector import BPEmb  # doctest: +SKIP
        >>> vectors = BPEmb(dim=25)  # doctest: +SKIP
        >>> subwords = "▁mel ford shire".split()  # doctest: +SKIP
        >>> vectors[subwords]  # doctest: +SKIP
        Columns 0 to 9
        -0.5859 -0.1803  0.2623 -0.6052  0.0194 -0.2795  0.2716 -0.2957 -0.0492  1.0934
         0.3848 -0.2412  1.0599 -0.8588 -1.2596 -0.2534 -0.5704  0.2168 -0.1718  1.2675
         1.4407 -0.0996  1.2239 -0.5085 -0.7542 -0.9628 -1.7177  0.0618 -0.4025  1.0405
        ...
        Columns 20 to 24
        -0.0022  0.4820 -0.5156 -0.0564  0.4300
         0.0355 -0.2257  0.1323  0.6053 -0.8878
        -0.0167 -0.3686  0.9666  0.2497 -1.2239
        [torch.FloatTensor of size 3x25]
    """
    # Templates filled in with the validated constructor arguments via ``str.format_map``.
    # NOTE(review): ``url_base`` carries no scheme or host, so the assembled ``url`` is a
    # relative path -- presumably a base URL prefix was lost; confirm the download location
    # before relying on automatic downloads.
    url_base = '{language}/'
    file_name = '{language}.wiki.bpe.op{merge_ops}.d{dim}.w2v.txt'
    zip_extension = '.tar.gz'

    def __init__(self, language='en', dim=300, merge_ops=50000, **kwargs):
        # Check if all parameters are valid
        if language not in SUPPORTED_LANGUAGES:
            raise ValueError(("Language '%s' not supported. Use one of the "
                              "following options instead:\n%s") % (language, SUPPORTED_LANGUAGES))
        if dim not in SUPPORTED_DIMS:
            raise ValueError(
                ("Embedding dimensionality of '%d' not supported. "
                 "Use one of the following options instead:\n%s") % (dim, SUPPORTED_DIMS))
        if merge_ops not in SUPPORTED_MERGE_OPS:
            raise ValueError(("Number of '%d' merge operations not supported. "
                              "Use one of the following options instead:\n%s") %
                             (merge_ops, SUPPORTED_MERGE_OPS))

        format_map = {'language': language, 'merge_ops': merge_ops, 'dim': dim}

        # Assemble file name to locally store embeddings under
        name = self.file_name.format_map(format_map)
        # Assemble URL to download the embeddings from
        url = (
            self.url_base.format_map(format_map) + self.file_name.format_map(format_map) +
            self.zip_extension)

        super(BPEmb, self).__init__(name, url=url, **kwargs)