The pretrained Word2Vec corpus used here is sgns.weibo.bigram-char. We build a dictionary `embeddings_index` to record the mapping from each word to its vector.
```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

embeddings_index = {}
with open('path-to/sgns.weibo.bigram-char', encoding='utf-8') as f:
    lines = f.readlines()
lines = [l.strip() for l in lines]
print(lines[0])  # 195197 300 -> 195197 words in total, embedding dim is 300
for line in lines[1:]:
    word, coefs = line.split(maxsplit=1)
    coefs = np.fromstring(coefs, 'f', sep=' ')  # parse the vector
    embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))  # Found 195197 word vectors.
```
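The `sequences` and `word_index` used below are produced by a Keras `Tokenizer` fitted on the segmented corpus; that step is not shown in this excerpt. A minimal sketch, assuming the pre-segmented sentences live in a list of space-joined strings named `texts` (a hypothetical name):

```python
from keras.preprocessing.text import Tokenizer

# Hypothetical setup: `texts` holds whitespace-separated, pre-segmented sentences,
# and MAX_NUM_WORDS caps the vocabulary size.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # list of token-id sequences
word_index = tokenizer.word_index                # token -> id mapping
```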
```python
# pad/truncate every sequence to a fixed length; `we` is the word-level model input
we = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
```
Generating embedding_matrix

Generate `embedding_matrix`, whose **shape is (number of words, embedding dimension)**.
```python
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index will be all-zeros
        embedding_matrix[i] = embedding_vector
```
```python
# save embedding_matrix and we
np.save('./we_embedding_matrix_{}.npy'.format(embedding_matrix.shape), embedding_matrix)
np.save('./we_{}.npy'.format(we.shape), we)
```
```python
import joblib

# save the token -> id mapping
joblib.dump(word_index, 'we_word_index.pkl')
```
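The excerpt stops at saving the matrix. The usual way to consume it, following the standard Keras pretrained-embedding pattern (not shown in the original), is to load it into a frozen `Embedding` layer:

```python
from keras.layers import Embedding

# Initialize the layer with the pretrained matrix and freeze it so the vectors
# are not updated during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
```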
Position vectors
```python
import json
import numpy as np
import joblib

with open('./preprocess/dataset.json', 'r') as f:
    dataset = json.load(f)
```

`find_index` locates a given word inside the padded id sequence `we_arr` and returns its position, or -1 if the word does not appear:

```python
def find_index(word, we_arr):
    assert len(we_arr) == MAX_SEQUENCE_LENGTH
    value = word_index[word]              # token id of the word
    index = np.where(we_arr == value)[0]  # positions where that id occurs
    if len(index) != 0:
        return index[0]
    else:
        # unreachable in principle: every trigger word is in the vocabulary
        return -1
```
`get_context_value` returns the token ids to the left of, at, and to the right of position `index` in `we_arr`:
```python
def get_context_value(index, we_arr):
    if index == -1:
        return 0, 0, 0
    curr = we_arr[index]
    # guard the boundaries explicitly: wrapping we_arr[index - 1] in try/except
    # would not catch index 0, since we_arr[-1] silently wraps to the last element
    left = we_arr[index - 1] if index > 0 else 0
    right = we_arr[index + 1] if index + 1 < len(we_arr) else 0
    return left, curr, right
```
For each piece of the dataset, locate every trigger word inside its padded sequence:

```python
for i, piece in enumerate(dataset):
    triggers = piece['triggers']
    for trigger in triggers:
        # the trigger word
        t_word = trigger['event_trigger']
        trigger_index = find_index(t_word, we[i])
```
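The loop above ends after computing `trigger_index`; presumably it is then fed to `get_context_value` to build the context features. A minimal sketch of that continuation, using a hypothetical list `contexts` that is not in the original:

```python
# Hypothetical continuation: one (left, curr, right) id triple per trigger.
contexts = []
for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        idx = find_index(trigger['event_trigger'], we[i])
        contexts.append(get_context_value(idx, we[i]))
contexts = np.array(contexts)  # shape: (total number of triggers, 3)
```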
Count how many times each event type occurs in the dataset, then sort the type names so the mapping to label ids is deterministic:

```python
event_type = {}  # not initialized in the excerpt; an empty dict is assumed
for sent in dataset:
    for trigger in sent['triggers']:
        t = trigger['event']
        if t not in event_type:
            event_type[t] = 1
        else:
            event_type[t] += 1
events = list(event_type.keys())
events.sort()
```
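The labelling loop below writes an event id at each trigger position, but `event2id` and `label` are not defined in this excerpt. A plausible setup, assuming id 0 is reserved for non-trigger tokens:

```python
# Assumed setup (not in the original excerpt): reserve id 0 for "no trigger".
event2id = {e: idx + 1 for idx, e in enumerate(events)}
label = np.zeros((len(dataset), MAX_SEQUENCE_LENGTH), dtype='int32')
```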
```python
for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        j = find_index(trigger['event_trigger'], we[i])
        t = event2id[trigger['event']]
        label[i][j] = t
```
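If these per-token labels later feed a softmax classifier, a common follow-up step (not part of the original excerpt) is to one-hot encode them:

```python
from keras.utils import to_categorical

# len(events) + 1 classes: all event types plus the reserved id 0 (no trigger).
label_onehot = to_categorical(label, num_classes=len(events) + 1)
```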