from torchnlp.word_to_vector.pretrained_word_vectors import _PretrainedWordVectors
# Language codes of the 275 supported BPEmb models, as listed at
# http://cosyne.h-its.org/bpemb/data/ (kept in alphabetical order).
SUPPORTED_LANGUAGES = """
    ab ace ady af ak als am an ang ar arc arz as ast
    atj av ay az azb ba bar bcl be bg bi bjn bm bn bo
    bpy br bs bug bxr ca cdo ce ceb ch chr chy ckb co
    cr crh cs csb cu cv cy da de din diq dsb dty dv dz
    ee el en eo es et eu ext fa ff fi fj fo fr frp
    frr fur fy ga gag gan gd gl glk gn gom got gu gv ha
    hak haw he hi hif hr hsb ht hu hy ia id ie ig ik
    ilo io is it iu ja jam jbo jv ka kaa kab kbd kbp kg
    ki kk kl km kn ko koi krc ks ksh ku kv kw ky la
    lad lb lbe lez lg li lij lmo ln lo lrc lt ltg lv
    mai mdf mg mh mhr mi min mk ml mn mr mrj ms mt mwl
    my myv mzn na nap nds ne new ng nl nn no nov nrm
    nso nv ny oc olo om or os pa pag pam pap pcd pdc
    pfl pi pih pl pms pnb pnt ps pt qu rm rmy rn ro ru
    rue rw sa sah sc scn sco sd se sg sh si sk sl sm
    sn so sq sr srn ss st stq su sv sw szl ta tcy te
    tet tg th ti tk tl tn to tpi tr ts tt tum tw ty
    tyv udm ug uk ur uz ve vec vep vi vls vo wa war wo
    wuu xal xh xmf yi yo za zea zh zu
""".split()

# Embedding dimensionalities for which embeddings were trained.
SUPPORTED_DIMS = [25, 50, 100, 200, 300]

# Numbers of merge operations for which embeddings were trained.
SUPPORTED_MERGE_OPS = [1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000]
class BPEmb(_PretrainedWordVectors):
    """
    Byte-Pair Encoding (BPE) embeddings trained on Wikipedia for 275 languages

    A collection of pre-trained subword unit embeddings in 275 languages, based
    on Byte-Pair Encoding (BPE). In an evaluation using fine-grained entity typing as testbed,
    BPEmb performs competitively, and for some languages better than alternative subword
    approaches, while requiring vastly fewer resources and no tokenization.

    References:
        * https://arxiv.org/abs/1710.02187
        * https://github.com/bheinzerling/bpemb

    Args:
        language (str, optional): Language of the corpus on which the embeddings
            have been trained; must be one of ``SUPPORTED_LANGUAGES``
        dim (int, optional): Dimensionality of the embeddings; must be one of
            ``SUPPORTED_DIMS``
        merge_ops (int, optional): Number of merge operations used by the
            tokenizer; must be one of ``SUPPORTED_MERGE_OPS``
        **kwargs: Forwarded unchanged to the ``_PretrainedWordVectors`` constructor

    Raises:
        ValueError: If ``language``, ``dim`` or ``merge_ops`` is not among the
            supported values.

    Example:
        >>> from torchnlp.word_to_vector import BPEmb  # doctest: +SKIP
        >>> vectors = BPEmb(dim=25)  # doctest: +SKIP
        >>> subwords = "▁mel ford shire".split()  # doctest: +SKIP
        >>> vectors[subwords]  # doctest: +SKIP
        Columns 0 to 9
        -0.5859 -0.1803  0.2623 -0.6052  0.0194 -0.2795  0.2716 -0.2957 -0.0492
        1.0934
         0.3848 -0.2412  1.0599 -0.8588 -1.2596 -0.2534 -0.5704  0.2168 -0.1718
        1.2675
         1.4407 -0.0996  1.2239 -0.5085 -0.7542 -0.9628 -1.7177  0.0618 -0.4025
        1.0405
        ...
        Columns 20 to 24
        -0.0022  0.4820 -0.5156 -0.0564  0.4300
         0.0355 -0.2257  0.1323  0.6053 -0.8878
        -0.0167 -0.3686  0.9666  0.2497 -1.2239
        [torch.FloatTensor of size 3x25]
    """
    # Template of the per-language directory that hosts the archives
    url_base = 'http://cosyne.h-its.org/bpemb/data/{language}/'
    # Template of the embedding file name (also used as the local name)
    file_name = '{language}.wiki.bpe.op{merge_ops}.d{dim}.w2v.txt'
    # Suffix appended to the file name to form the download URL
    zip_extension = '.tar.gz'

    def __init__(self, language='en', dim=300, merge_ops=50000, **kwargs):
        # Check if all parameters are valid
        if language not in SUPPORTED_LANGUAGES:
            raise ValueError(("Language '%s' not supported. Use one of the "
                              "following options instead:\n%s") % (language, SUPPORTED_LANGUAGES))

        # NOTE: '%s' (not '%d') is used below on purpose. A non-integer
        # argument such as dim='300' fails the membership test, and '%d' would
        # then raise TypeError while building the message instead of the
        # intended ValueError. For integers both render identically.
        if dim not in SUPPORTED_DIMS:
            raise ValueError(
                ("Embedding dimensionality of '%s' not supported. "
                 "Use one of the following options instead:\n%s") % (dim, SUPPORTED_DIMS))
        if merge_ops not in SUPPORTED_MERGE_OPS:
            raise ValueError(("Number of '%s' merge operations not supported. "
                              "Use one of the following options instead:\n%s") %
                             (merge_ops, SUPPORTED_MERGE_OPS))

        format_map = {'language': language, 'merge_ops': merge_ops, 'dim': dim}

        # Assemble file name to locally store embeddings under
        name = self.file_name.format_map(format_map)

        # Assemble URL to download the embeddings from:
        # <language directory>/<file name><archive extension>
        url = self.url_base.format_map(format_map) + name + self.zip_extension

        super(BPEmb, self).__init__(name, url=url, **kwargs)