一、bytes_to_unicode
def bytes_to_unicode():
    """Build a reversible lookup table from byte values to unicode characters.

    The reversible bpe codes work on unicode strings. This means you need a
    large number of unicode characters in your vocab if you want to avoid
    UNKs. When you're at something like a 10B token dataset you end up needing
    around 5K for decent coverage. This is a significant percentage of your
    normal, say, 32K bpe vocab. To avoid that, we want lookup tables between
    utf-8 bytes and unicode strings, and we avoid mapping to
    whitespace/control characters the bpe code barfs on.

    Returns:
        A dict with exactly one entry for every int in ``range(256)``. Byte
        values inside the three ranges ``!``..``~``, ``¡``..``¬`` and
        ``®``..``ÿ`` map to the character with the same code point; every
        other byte value maps to ``chr(256 + n)``, where ``n`` counts the
        remapped bytes in ascending order. All 256 values are distinct
        one-character strings, so the mapping can be inverted.
    """
    # Byte values that are kept as-is (they map to their own code point).
    kept = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    # Set copy for O(1) membership tests; the lists preserve insertion order.
    kept_lookup = set(kept)

    bs = list(kept)  # byte values (dict keys)
    cs = list(kept)  # code points of the replacement characters (dict values)
    n = 0
    for b in range(2**8):
        if b not in kept_lookup:
            # Remaining bytes get stand-in code points starting at 256, which
            # cannot collide with any kept byte value.
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))
可能的问题:
1.Unicode != ASCII,ASCII是Unicode的子集
编码 | 范围 | 说明 |
---|---|---|
ASCII | 0 ~ 127 | 最早的字符编码,只支持英语字母、数字、标点、控制字符 |
Unicode | 0 ~ 1,114,111(约百万个码位) | 全球通用字符编码,包含了 ASCII 全部内容,还支持中文、日文、表情符号等 |
2.bs是idx,cs是对应的字符
3.为什么bs里append的是b而cs是2**8+n?
因为bs初始只包含可打印字符,跳过了空白符和控制字符等不安全的字节;但又希望0到255的每个字节都有映射、没有空缺,所以把缺失的字节b补进bs,并在cs中用码位从256开始(2**8+n)的可打印Unicode字符来代替它(注意不是“ASCII之外”,而是0~255之外,这样不会和已有的映射冲突)
4.cs = bs[:]
语法 | 解释 |
---|---|
cs = bs |
两个变量指向 同一个列表,修改一个会影响另一个(引用赋值) |
cs = bs[:] |
复制了一个 新的列表,内容一样,但两个变量互不影响(浅拷贝) |
5.一些函数
问题 | 解答 |
---|---|
ord() |
字符 → Unicode 整数,比如 ord('A') = 65 |
chr() |
Unicode 整数 → 字符,比如 chr(65) = 'A' |
dict(zip(bs, cs)) |
把两个列表配对,创建 字节 → 字符 的映射字典 |
二、get_pairs
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: A sliceable sequence of symbols, normally a tuple of
            variable-length strings, e.g. ``('ab', 'cd', 'e')``.

    Returns:
        A set of ``(left, right)`` tuples, one for each pair of neighbouring
        symbols. A word with fewer than two symbols (including an empty
        word) yields an empty set.
    """
    # Pairing the word with itself shifted by one gives every adjacent pair;
    # zip stops at the shorter sequence, so short/empty words need no guard.
    return set(zip(word, word[1:]))
这里应该好理解,就是给定一个由多个符号(symbol)组成的单词(word),返回它所有相邻符号对的集合。下面是举例:
word = ('ab', 'cd', 'e')
get_pairs(word)
→ {('ab', 'cd'), ('cd', 'e')}
三、basic_clean + whitespace_clean
函数或方法 | 作用 | 示例输入 | 示例输出 | 备注 |
---|---|---|---|---|
ftfy.fix_text(text) |
修复 Unicode 编码错误、替换乱码字符 | "â€œhelloâ€\x9d" |
"“hello”" |
修复网页/数据库导出乱码最常用 |
html.unescape(text) |
将 HTML 实体编码还原为正常字符 | "&lt;div&gt;" |
"<div>" |
常用于网页文本解析 |
text.strip() |
去除字符串前后的空白字符(包括 \n , \t , 空格) |
" hello world \n" |
"hello world" |
不影响中间空格 |
re.sub(r'\s+', ' ', text) |
将所有连续的空白字符替换为一个空格 | "This\nis\t\ta test" |
"This is a test" |
中间所有空白都变成一个普通空格 |
四、bpe
word = tuple(token[:-1]) + (token[-1] + '</w>',)
举例:token = 'low'
→ ('l', 'o', 'w</w>')
为什么不是word = tuple(token) + ( '</w>',),这样的话就会把最后一个字符和'</w>'分开,导致无法标志单词结束的位置
pairs = get_pairs(word)
举例:('l', 'o', 'w</w>')
→ {('l','o'), ('o','w</w>')}
if not pairs:
return token+'</w>'
举例:
输入 token |
处理后 word |
pairs |
输出 |
---|---|---|---|
"a" |
('a</w>',) |
set() (空集) |
"a</w>" |
下面用“lowered”来举例bpe的完整过程:
轮次 | 当前 word |
合并的 bigram | 合并后结果 |
---|---|---|---|
1 | ('l','o','w','e','r','e','d</w>') |
('l','o') |
('lo','w','e','r','e','d</w>') |
2 | ('lo','w','e','r','e','d</w>') |
('lo','w') |
('low','e','r','e','d</w>') |
3 | ('low','e','r','e','d</w>') |
('e','r') |
('low','er','e','d</w>') |
4 | ('low','er','e','d</w>') |
⛔无匹配 | 循环结束 |
def bpe(self, token):
    """Apply the learned BPE merges to a single token.

    The token is split into single characters, with ``'</w>'`` appended to
    the last character as the end-of-word marker. The adjacent pair with the
    lowest rank in ``self.bpe_ranks`` is then merged repeatedly until no
    remaining pair appears in ``self.bpe_ranks`` or the word has collapsed
    into a single symbol.

    Args:
        token: A non-empty string. An empty string raises ``IndexError``
            because the last character is accessed unconditionally.

    Returns:
        The resulting symbols joined by single spaces. For a one-character
        token there is nothing to merge and ``token + '</w>'`` is returned
        directly.

    Side effects:
        Reads ``self.cache`` and stores the result under ``token``. The
        one-character early return is not written to the cache.
    """
    if token in self.cache:
        return self.cache[token]

    # Attach the end-of-word marker to the final character so it is merged
    # together with that character rather than as a separate symbol.
    word = tuple(token[:-1]) + (token[-1] + '</w>',)
    pairs = get_pairs(word)

    if not pairs:
        return token+'</w>'

    while True:
        # Pairs without a rank sort last (infinite rank).
        bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
        if bigram not in self.bpe_ranks:
            # Even the best candidate has no merge rule: we are done.
            break
        first, second = bigram

        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # `first` does not occur again; copy the tail unchanged.
                new_word.extend(word[i:])
                break
            # Copy everything before the next occurrence of `first`.
            new_word.extend(word[i:j])
            i = j

            if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                # Found the bigram: emit the merged symbol, skip both parts.
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1

        word = tuple(new_word)
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    word = ' '.join(word)
    self.cache[token] = word
    return word