Source code for torchnlp.word_to_vector.bpemb

from torchnlp.word_to_vector.pretrained_word_vectors import _PretrainedWordVectors

# List of all 275 supported languages from http://cosyne.h-its.org/bpemb/data/
SUPPORTED_LANGUAGES = [
    'ab', 'ace', 'ady', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arc', 'arz', 'as', 'ast',
    'atj', 'av', 'ay', 'az', 'azb', 'ba', 'bar', 'bcl', 'be', 'bg', 'bi', 'bjn', 'bm', 'bn', 'bo',
    'bpy', 'br', 'bs', 'bug', 'bxr', 'ca', 'cdo', 'ce', 'ceb', 'ch', 'chr', 'chy', 'ckb', 'co',
    'cr', 'crh', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz',
    'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'frp',
    'frr', 'fur', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'glk', 'gn', 'gom', 'got', 'gu', 'gv', 'ha',
    'hak', 'haw', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ik',
    'ilo', 'io', 'is', 'it', 'iu', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kg',
    'ki', 'kk', 'kl', 'km', 'kn', 'ko', 'koi', 'krc', 'ks', 'ksh', 'ku', 'kv', 'kw', 'ky', 'la',
    'lad', 'lb', 'lbe', 'lez', 'lg', 'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'ltg', 'lv',
    'mai', 'mdf', 'mg', 'mh', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl',
    'my', 'myv', 'mzn', 'na', 'nap', 'nds', 'ne', 'new', 'ng', 'nl', 'nn', 'no', 'nov', 'nrm',
    'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pag', 'pam', 'pap', 'pcd', 'pdc',
    'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'qu', 'rm', 'rmy', 'rn', 'ro', 'ru',
    'rue', 'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm',
    'sn', 'so', 'sq', 'sr', 'srn', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te',
    'tet', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tpi', 'tr', 'ts', 'tt', 'tum', 'tw', 'ty',
    'tyv', 'udm', 'ug', 'uk', 'ur', 'uz', 've', 'vec', 'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'wo',
    'wuu', 'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zh', 'zu'
]

# All supported vector dimensionalities for which embeddings were trained
SUPPORTED_DIMS = [25, 50, 100, 200, 300]

# All supported numbers of merge operations for which embeddings were trained
SUPPORTED_MERGE_OPS = [1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000]
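
# Illustrative sketch (not part of the original module): how one supported
# (language, dim, merge_ops) combination maps onto the local file name and the
# download URL that ``BPEmb`` below assembles from its ``url_base``,
# ``file_name`` and ``zip_extension`` class attributes.  The ``_example*``
# names are purely illustrative.
_example = {'language': 'en', 'merge_ops': 50000, 'dim': 300}
assert (_example['language'] in SUPPORTED_LANGUAGES and _example['dim'] in SUPPORTED_DIMS
        and _example['merge_ops'] in SUPPORTED_MERGE_OPS)

# Local file name the vectors are cached under:
#   'en.wiki.bpe.op50000.d300.w2v.txt'
_example_name = '{language}.wiki.bpe.op{merge_ops}.d{dim}.w2v.txt'.format_map(_example)

# Archive that is downloaded and extracted to produce that file:
#   'http://cosyne.h-its.org/bpemb/data/en/en.wiki.bpe.op50000.d300.w2v.txt.tar.gz'
_example_url = ('http://cosyne.h-its.org/bpemb/data/{language}/'.format_map(_example) +
                _example_name + '.tar.gz')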


class BPEmb(_PretrainedWordVectors):
    """ Byte-Pair Encoding (BPE) embeddings trained on Wikipedia for 275 languages

    A collection of pre-trained subword unit embeddings in 275 languages, based
    on Byte-Pair Encoding (BPE). In an evaluation using fine-grained entity
    typing as testbed, BPEmb performs competitively, and for some languages
    better than alternative subword approaches, while requiring vastly fewer
    resources and no tokenization.

    References:
        * https://arxiv.org/abs/1710.02187
        * https://github.com/bheinzerling/bpemb

    Args:
        language (str, optional): Language of the corpus on which the embeddings
            have been trained
        dim (int, optional): Dimensionality of the embeddings
        merge_ops (int, optional): Number of merge operations used by the
            tokenizer

    Example:
        >>> from torchnlp.word_to_vector import BPEmb  # doctest: +SKIP
        >>> vectors = BPEmb(dim=25)  # doctest: +SKIP
        >>> subwords = "▁mel ford shire".split()  # doctest: +SKIP
        >>> vectors[subwords]  # doctest: +SKIP

        Columns 0 to 9
        -0.5859 -0.1803  0.2623 -0.6052  0.0194 -0.2795  0.2716 -0.2957 -0.0492  1.0934
         0.3848 -0.2412  1.0599 -0.8588 -1.2596 -0.2534 -0.5704  0.2168 -0.1718  1.2675
         1.4407 -0.0996  1.2239 -0.5085 -0.7542 -0.9628 -1.7177  0.0618 -0.4025  1.0405

        ...

        Columns 20 to 24
        -0.0022  0.4820 -0.5156 -0.0564  0.4300
         0.0355 -0.2257  0.1323  0.6053 -0.8878
        -0.0167 -0.3686  0.9666  0.2497 -1.2239

        [torch.FloatTensor of size 3x25]
    """
    url_base = 'http://cosyne.h-its.org/bpemb/data/{language}/'
    file_name = '{language}.wiki.bpe.op{merge_ops}.d{dim}.w2v.txt'
    zip_extension = '.tar.gz'

    def __init__(self, language='en', dim=300, merge_ops=50000, **kwargs):
        # Check that all parameters are valid before attempting a download
        if language not in SUPPORTED_LANGUAGES:
            raise ValueError(("Language '%s' not supported. Use one of the "
                              "following options instead:\n%s") %
                             (language, SUPPORTED_LANGUAGES))
        if dim not in SUPPORTED_DIMS:
            raise ValueError(("Embedding dimensionality of '%d' not supported. "
                              "Use one of the following options instead:\n%s") %
                             (dim, SUPPORTED_DIMS))
        if merge_ops not in SUPPORTED_MERGE_OPS:
            raise ValueError(("Number of merge operations '%d' not supported. "
                              "Use one of the following options instead:\n%s") %
                             (merge_ops, SUPPORTED_MERGE_OPS))

        format_map = {'language': language, 'merge_ops': merge_ops, 'dim': dim}

        # Assemble the file name under which the embeddings are stored locally
        name = self.file_name.format_map(format_map)

        # Assemble the URL from which the embeddings are downloaded
        url = (
            self.url_base.format_map(format_map) + self.file_name.format_map(format_map) +
            self.zip_extension)

        super(BPEmb, self).__init__(name, url=url, **kwargs)
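

# Usage sketch (not part of the original module): construct ``BPEmb`` with
# non-default parameters and look up vectors for BPE subword units, along the
# same lines as the docstring example above.  The parameter choices below are
# illustrative; constructing the object downloads and extracts the pretrained
# archive on first use, assuming the URL above is still reachable.
if __name__ == '__main__':  # pragma: no cover
    # German subword vectors: 100 dimensions, 25k BPE merge operations.
    vectors = BPEmb(language='de', dim=100, merge_ops=25000)

    # With the default ``unk_init`` of the parent class, unknown subwords fall
    # back to a zero vector, so a lookup always returns a tensor of size ``dim``.
    print(vectors['▁der'].shape)  # torch.Size([100])

    # Unsupported combinations fail fast, before anything is downloaded.
    try:
        BPEmb(dim=42)
    except ValueError as error:
        print(error)  # lists SUPPORTED_DIMS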