def read_time_machine():  #@save
    """Load the Time Machine dataset into a list of text lines.

    Returns:
        A list of strings, one per line of the source file, with every
        non-alphabetic character replaced by a space, surrounding
        whitespace stripped, and all letters lower-cased.
    """
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    # Keep letters only so later tokenization sees a clean alphabet.
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
# Load the corpus once at module level; later examples tokenize these lines.
lines = read_time_machine()
2. 词元化
将每一行又分解为若干词语:
def tokenize(lines, token='word'):  #@save
    """Split text lines into word or character tokens.

    Args:
        lines: list of strings, one per text line.
        token: 'word' to split on whitespace, 'char' to split into
            single characters.

    Returns:
        A list (one entry per line) of token lists, or None after
        printing an error message when `token` is unrecognized.
    """
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        # NOTE(review): prints instead of raising, matching the book's
        # original best-effort style; callers get None in this case.
        print('错误:未知词元类型:' + token)
class Vocab:  #@save
    """Vocabulary for text: maps tokens to integer indices and back."""

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary from a 1D or 2D list of tokens.

        Args:
            tokens: 1D list of tokens or 2D list of token lines.
            min_freq: tokens appearing fewer times than this are dropped.
            reserved_tokens: extra special tokens placed right after '<unk>'.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort by frequency, most frequent first.
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The unknown token always gets index 0.
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                # Frequencies are sorted descending, so we can stop early.
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of distinct tokens (including specials)."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token (or nested list/tuple of tokens) to indices.

        Unknown tokens map to `self.unk` (index 0).
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index (or list/tuple of indices) back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        # Index of the unknown token. Required by __getitem__'s fallback;
        # it was missing in the original, causing AttributeError on any
        # out-of-vocabulary lookup.
        return 0

    @property
    def token_freqs(self):
        # Read-only view of (token, frequency) pairs, sorted descending.
        return self._token_freqs
def count_corpus(tokens):  #@save
    """Count token frequencies.

    Args:
        tokens: a 1D list of tokens or a 2D list of token lines.

    Returns:
        A collections.Counter mapping each token to its frequency.
    """
    # `tokens` may be a 1D list or a 2D list; an empty list takes the
    # flatten branch too (harmlessly yielding an empty list).
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten the list of token lines into a single token list.
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)