import json
import numpy as np
import joblib

# Load the preprocessed dataset. The encoding is passed explicitly so that the
# result does not depend on the platform's default text encoding.
with open('./preprocess/dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

len(dataset)
# 1665

MAX_SEQUENCE_LENGTH = 85  # maximum sentence length
EVENT_TYPE = 7  # number of event types

Word2Vec

生成 embeddings_index

这里用的预训练 Word2Vec 词向量为 sgns.weibo.bigram-char，建立一个字典 embeddings_index，用来记录 word 与 vec 的映射。

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Map every word of the pretrained file to its vector.
embeddings_index = {}
with open('path-to/sgns.weibo.bigram-char', encoding='utf-8') as f:
    # The first line is a header, not a vector,
    # e.g. "195197 300": 195197 words, embedding dimension 300.
    print(next(f, '').strip())

    # Stream the remaining lines rather than holding the whole file (and a
    # stripped copy of it) in memory.
    for line in f:
        word, coefs = line.strip().split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')  # parse the text into a vector
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
# Found 195197 word vectors.

填充句子

首先定义最大的词数为6000，embedding-dim 为 300。并且用 texts 记录每句话：

MAX_NUM_WORDS = 6000  # maximum number of words
EMBEDDING_DIM = 300  # length of each word vector

# One entry per sentence, taken from the 'sentence_words' field.
texts = [piece['sentence_words'] for piece in dataset]

len(texts)  # 1665

进行分词，得到 word_index 用于记录在 Tokenizer 中 token 和 id 的映射关系。

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
# Tokenizer has no `fit_on` method; the vocabulary is built with fit_on_texts.
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)  # convert each sentence to a sequence of word ids

# word_index maps each token to its id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Found 6476 unique tokens.

最后利用 pad_sequences 填充句子至数据集中的最大句子长度（85）：

# Pad every id sequence to MAX_SEQUENCE_LENGTH.
we = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

生成 embedding_matrix

生成 embedding_matrix，**维度为 (单词数量, embedding 长度)**，第 i 行是 id 为 i 的单词的词向量。

# Row i of the matrix holds the pretrained vector of the word whose id is i.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, token_id in word_index.items():
    # Ids at or beyond MAX_NUM_WORDS are skipped; words missing from the
    # pretrained index keep their all-zero row.
    vector = embeddings_index.get(token) if token_id < MAX_NUM_WORDS else None
    if vector is not None:
        embedding_matrix[token_id] = vector

# Save embedding_matrix and we; each file name embeds the array shape.
np.save('./we_embedding_matrix_{}.npy'.format(embedding_matrix.shape), embedding_matrix)
np.save('./we_{}.npy'.format(we.shape), we)

import joblib
# Save the token -> id mapping.
joblib.dump(word_index, 'we_word_index.pkl')

位置向量 Position

import json
import numpy as np
import joblib

# The encoding is passed explicitly so that the result does not depend on the
# platform's default text encoding.
with open('./preprocess/dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)


len(dataset)  # 1665

# we: the padded sentences, loaded as an array whose elements are token ids
we = np.load('./we_(1665, 85).npy')
we.shape  # (1665, 85)

# token -> id mapping
word_index = joblib.load('./we_word_index.pkl')
len(word_index)  # 6476

cal_distance_arr 函数

cal_distance_arr 函数用来计算位置信息：返回 word 在句子中第一次出现的下标，以及句中每个位置到该下标的距离（绝对值），填充位置的距离记为 -1；若 word 不在句子中，则返回 -1 和全为 -1 的数组。

def cal_distance_arr(word, we_arr):
    """Locate ``word`` in a padded sentence and measure distances to it.

    Args:
        word: Token whose id is looked up in the module-level ``word_index``.
        we_arr: Token ids of one sentence, of length ``MAX_SEQUENCE_LENGTH``;
            positions holding id 0 are treated as padding.

    Returns:
        A pair ``(index, distances)``. ``index`` is the first position whose
        id equals the id of ``word``, and ``distances[k]`` is
        ``abs(k - index)`` with -1 written at the padding positions. When the
        id does not occur in ``we_arr`` the pair is ``-1`` and an array
        filled with -1.
    """
    assert len(we_arr) == MAX_SEQUENCE_LENGTH

    token_id = word_index[word]
    hits = np.where(we_arr == token_id)[0]  # all positions holding this id

    if len(hits) == 0:
        # The word does not occur in this sentence.
        return -1, np.zeros(MAX_SEQUENCE_LENGTH) - 1

    first = hits[0]
    distances = np.abs(np.arange(MAX_SEQUENCE_LENGTH) - first)
    distances[we_arr == 0] = -1  # padding positions carry no distance
    return first, distances
    
# Example call on the first sentence. In the output pasted below, the word is
# found at index 69 and the first 64 positions hold -1.
cal_distance_arr('领导',we[0])
# (69,
#  array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
#         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
#         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
#         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5,  4,  3,  2,
#          1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]))

计算位置向量

初始化位置向量矩阵 position，形状为 (句子数, 85, 85, 2)：前两维定位第 i 个句子中下标为 j 的（触发）词；第三维对应句中的每个位置；最后一维的两个分量分别是该位置到触发词的距离和到论元的距离。

# One (MAX_SEQUENCE_LENGTH, 2) slice per sentence and per token position.
position = np.zeros(
    shape=(len(dataset), MAX_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH, 2)
)
position.shape
# (1665, 85, 85, 2)

计算触发词和对应论元的位置向量：

# Fill, for every trigger found in its sentence, the distances of each
# position to the trigger (column 0) and to the argument (column 1).
for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        # Trigger word.
        t_word = trigger['event_trigger']
        trigger_index, trigger_arr = cal_distance_arr(t_word, we[i])
        if trigger_index == -1:
            # The trigger does not occur in the padded sentence.
            continue

        # Argument word.
        try:
            a_word = trigger['event_arguments']
            _, argument_arr = cal_distance_arr(a_word, we[i])
        except (KeyError, TypeError):
            # Best effort: a missing 'event_arguments' key, a word absent from
            # word_index, or an unhashable value all mean "no usable
            # argument", so every position is marked -1. Only these lookup
            # errors are handled, so interrupts and unrelated failures are no
            # longer swallowed.
            argument_arr = np.zeros(MAX_SEQUENCE_LENGTH) - 1

        position[i, trigger_index] = np.stack([trigger_arr, argument_arr], axis=-1)

np.save('./position_{}.npy'.format(position.shape), position)

Lexical Level Feature

find_index 函数和 get_context_value 函数

find_index 函数用于获取 word 在 we_arr 中的下标，若不可达则返回 -1：

def find_index(word, we_arr):
    """Return the first position of ``word`` in ``we_arr``, or -1.

    Args:
        word: Token whose id is looked up in the module-level ``word_index``.
        we_arr: Token ids of one sentence, of length ``MAX_SEQUENCE_LENGTH``.

    Returns:
        The first index at which the id of ``word`` occurs in ``we_arr``,
        or -1 when it does not occur.
    """
    assert len(we_arr) == MAX_SEQUENCE_LENGTH

    matches = np.where(we_arr == word_index[word])[0]
    return matches[0] if len(matches) != 0 else -1

get_context_value 用于返回 we_arr 中坐标为 index 左、中、右的 token id：

def get_context_value(index, we_arr):
    """Return the token ids to the left of, at, and to the right of ``index``.

    Args:
        index: Position in ``we_arr``, or -1 to signal "not found".
        we_arr: Sequence of token ids for one sentence.

    Returns:
        A tuple ``(left, curr, right)``. A neighbour that would fall outside
        the sequence is reported as 0, and ``(0, 0, 0)`` is returned when
        ``index`` is -1.
    """
    if index == -1:
        return 0, 0, 0

    curr = we_arr[index]
    # A subscript of -1 would wrap around to the last element rather than
    # raise, so both edges are checked explicitly instead of relying on an
    # exception.
    left = we_arr[index - 1] if index > 0 else 0
    right = we_arr[index + 1] if index + 1 < len(we_arr) else 0

    return left, curr, right

计算 Lexical 矩阵

首先初始化 Lexical 矩阵，前两维代表第 i 个句子的第 j 个（触发）词的 Lexical 向量；最后一维长度为 6，前 3 个分量是触发词的左、中、右 token id，后 3 个分量是论元的左、中、右 token id（均由 get_context_value 返回）。

# Six values per sentence and per token position.
lexical = np.zeros(shape=(len(dataset), MAX_SEQUENCE_LENGTH, 6))
lexical.shape
# (1665, 85, 6)

计算 Lexical 矩阵：

# Fill, for every trigger found in its sentence, the (left, current, right)
# token ids around the trigger and around the argument.
for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        # Trigger word.
        t_word = trigger['event_trigger']
        trigger_index = find_index(t_word, we[i])
        if trigger_index == -1:
            # The trigger does not occur in the padded sentence.
            continue

        # Argument word.
        try:
            a_word = trigger['event_arguments']
            argument_index = find_index(a_word, we[i])
        except (KeyError, TypeError):
            # Best effort: a missing 'event_arguments' key, a word absent from
            # word_index, or an unhashable value all mean "no usable
            # argument". Only these lookup errors are handled, so interrupts
            # and unrelated failures are no longer swallowed.
            argument_index = -1

        # Trigger context.
        lexical[i, trigger_index, :3] = get_context_value(trigger_index, we[i])
        # Argument context.
        lexical[i, trigger_index, 3:] = get_context_value(argument_index, we[i])

np.save('./lexical_{}.npy'.format(lexical.shape), lexical)

用于 DynamicPooling：Mask

因为在动态池化时，把每一个 feature map 分为了 3 部分（被触发词和论元划分），所以需要计算 Mask 矩阵用于动态池化操作。

gen_mask_arr 函数

gen_mask_arr 函数用于产生 mask_arr，输入中 t_index、a_index 分别是触发词的下标、论元的下标。

def gen_mask_arr(t_index, a_index):
    """Build the three-row mask for one trigger/argument pair.

    The two indices are sorted into ``low <= high`` and a negative ``low`` is
    clamped to 0. Starting from all ones, row 0 is zeroed on ``[0, low)``,
    row 1 on ``[low, high)`` and row 2 on ``[high, end)``.

    Args:
        t_index: Index of the trigger word.
        a_index: Index of the argument word.

    Returns:
        An array of shape ``(3, MAX_SEQUENCE_LENGTH)`` holding ones and zeros.
    """
    low, high = sorted((t_index, a_index))
    low = max(low, 0)

    mask_arr = np.ones((3, MAX_SEQUENCE_LENGTH))
    mask_arr[0, :low] = 0
    mask_arr[1, low:high] = 0
    mask_arr[2, high:] = 0
    return mask_arr

计算 Mask 矩阵

首先初始化 Mask 矩阵，形状中的 “3” 代表每个 feature map 被划分为三个部分。

# One (3, MAX_SEQUENCE_LENGTH) mask per sentence and per token position,
# initialised to all ones.
mask = np.ones(shape=(len(dataset), MAX_SEQUENCE_LENGTH, 3, MAX_SEQUENCE_LENGTH))
mask.shape
# (1665, 85, 3, 85)

计算 Mask 矩阵。

# Fill the mask of every trigger found in its sentence.
# NOTE(review): mask is not written to disk in this section — confirm whether
# it is saved elsewhere.
for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        # Trigger word.
        t_word = trigger['event_trigger']
        trigger_index = find_index(t_word, we[i])
        if trigger_index == -1:
            # The trigger does not occur in the padded sentence.
            continue

        # Argument word.
        try:
            a_word = trigger['event_arguments']
            argument_index = find_index(a_word, we[i])
        except (KeyError, TypeError):
            # Best effort: a missing 'event_arguments' key, a word absent from
            # word_index, or an unhashable value all mean "no usable
            # argument". Only these lookup errors are handled, so interrupts
            # and unrelated failures are no longer swallowed.
            argument_index = -1

        mask[i, trigger_index] = gen_mask_arr(trigger_index, argument_index)

生成 Label

得到 event2id

event2id 是一个字典，其记录了事件类型与 id 的映射。

# Count how many triggers carry each event type.
event_type = {}
for sent in dataset:
    for trigger in sent['triggers']:
        t = trigger['event']
        event_type[t] = event_type.get(t, 0) + 1

# Ids start at 1 and follow the sorted order of the event type names.
events = sorted(event_type)

event2id = {name: idx for idx, name in enumerate(events, start=1)}
event2id
# {'action': 1,
#  'emergency': 2,
#  'movement': 3,
#  'operation': 4,
#  'perception': 5,
#  'stateChange': 6,
#  'statement': 7}

利用 to_categorical 函数生成 Label

利用 to_categorical 生成 one-hot 形式的 label 向量：

from tensorflow.keras.utils import to_categorical

# Per-token class ids; positions without a trigger keep the initial 0.
label = np.zeros((len(dataset), MAX_SEQUENCE_LENGTH))

for i, piece in enumerate(dataset):
    for trigger in piece['triggers']:
        j = find_index(trigger['event_trigger'], we[i])
        if j == -1:
            # find_index signals "not found" with -1; used as a subscript it
            # would label the last token of the sentence, so the trigger is
            # skipped.
            continue
        label[i, j] = event2id[trigger['event']]

# Ids run from 1 to len(event2id) and 0 is kept for "no trigger"; fixing
# num_classes keeps the last dimension independent of which ids occur.
label = to_categorical(label, num_classes=len(event2id) + 1)
label.shape
# (1665, 85, 8)

np.save('./label_{}.npy'.format(label.shape), label)

Dawn's Blogs

事件抽取模型复现之DMCNN (2) 制作模型所需要的各项输入/输出矩阵