# -*- coding: utf-8 -*-
import os
import h5py
import numpy as np
import copy
import math
from config import HERE
from utils.logger import BaseLogger
class H5EmbeddingManager(BaseLogger):
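    """
    Wraps an HDF5 embedding file that stores an 'embedding' matrix, a
    'words_flatten' string (the vocabulary joined by newlines) and a
    'vocab_len' attribute. With mode='disk' rows are read lazily from the
    file; with mode='in-memory' the full matrix is loaded as a numpy array.
    """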
def __init__(self, h5_path, mode='disk', **kwargs):
super(H5EmbeddingManager, self).__init__(**kwargs)
        self.mode = mode
        f = h5py.File(h5_path, 'r')
        if mode == 'disk':
            # keep the HDF5 dataset on disk and read rows lazily
            self.W = f['embedding']
        elif mode == 'in-memory':
            # load the whole embedding matrix into memory
            self.W = f['embedding'][:]
        else:
            raise ValueError("unknown mode: %s" % mode)
message = "load mode=%s, embedding data type=%s, shape=%s" % (self.mode, type(self.W), self.W.shape)
self.info(message)
        words_flatten = f['words_flatten'][0]
        if isinstance(words_flatten, bytes):
            # h5py may hand back the flattened vocabulary as bytes; decode before splitting
            words_flatten = words_flatten.decode('utf-8')
        self.id2word = words_flatten.split('\n')
        assert len(self.id2word) == f.attrs['vocab_len'], "%s != %s" % (len(self.id2word), f.attrs['vocab_len'])
        self.word2id = dict(zip(self.id2word, range(len(self.id2word))))
        del words_flatten
    def __getitem__(self, item):
        if isinstance(item, str):
            index = self.word2id[item]
            return self.W[index]
        raise RuntimeError("don't support type: %s" % type(item))
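    # Example lookup (a sketch; the HDF5 path is hypothetical):
    #   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
    #   vec = manager['house']   # one row of the 'embedding' dataset, as a 1-D numpy array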
def init_word_embedding(self, words, dim_size=300, scale=0.1, mode='google'):
        self.info('loading word embedding.')
word2id = self.word2id
W = self.W
shape = (len(words), dim_size)
np.random.seed(len(words))
# W2V = np.random.uniform(low=-scale, high=scale, size=shape).astype('float32')
W2V = np.zeros(shape, dtype='float32')
        # row 0 (e.g. the padding token) is left all-zero
        for i, word in enumerate(words[1:], 1):
            if word in word2id:
                _id = word2id[word]
                # copy before normalising so the stored embedding is not modified in place
                vec = np.array(W[_id], dtype='float32')
                vec /= np.linalg.norm(vec)
            elif word.capitalize() in word2id:
                _id = word2id[word.capitalize()]
                vec = np.array(W[_id], dtype='float32')
                vec /= np.linalg.norm(vec)
            else:
                # OOV words get a small random vector
                vec = (0.01 * np.random.normal(0, 1.0, dim_size)).astype('float32')
            W2V[i] = vec[:dim_size]
return W2V
def init_word_embedding1(self, words, dim_size=300, scale=0.1, mode='google'):
word2id = self.word2id
W = self.W
shape = (len(words), dim_size)
np.random.seed(len(words))
# W2V = np.random.uniform(low=-scale, high=scale, size=shape).astype('float32')
W2V = np.random.normal(0, 1.0, size=shape).astype('float32') * 0.01
W2V[0, :] = 0
if mode == 'random':
return W2V
        in_vocab = np.ones(shape[0], dtype=bool)
oov_set = set()
word_ids = []
for i, word in enumerate(words):
            # prefer the exact form, then the capitalised form
            _id = word2id.get(word, -1)
            if _id < 0:
                _id = word2id.get(word.capitalize(), -1)
            if _id < 0:
                in_vocab[i] = False
                if not word.startswith("$oov-"):
                    oov_set.add(word)
            else:
                word_ids.append(_id)
        if self.mode == 'in-memory':
            # assign through W2V directly; W2V[in_vocab][:, :] = ... would write to a copy
            W2V[in_vocab] = W[np.array(word_ids, dtype='int32')][:, :dim_size]
        else:
            # word_ids only holds ids for in-vocab words, so pair it positionally
            # with the nonzero positions of the in_vocab mask
            nonzero_ids = in_vocab.nonzero()[0]
            for j, i in enumerate(nonzero_ids):
                emb = W[word_ids[j]]
                W2V[i][:] = emb[:dim_size]
        # self.info("%s words are not in the pretrained embedding and are "
        #           "randomly initialized: %s" % (len(oov_set), oov_set))
return W2V
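
# Example of building an initial embedding matrix for a task vocabulary
# (a sketch; the HDF5 path and vocabulary are hypothetical):
#
#   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
#   vocab = ['<pad>', 'east', 'west', 'expensive', 'cheap']
#   W_init = manager.init_word_embedding1(vocab, dim_size=300)
#   # row 0 stays all-zero; OOV words keep their small random vectors
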
class EmbeddingInitEnhancer(BaseLogger):
'''
For more details, read "Counter-fitting Word Vectors to Linguistic Constraints"
'''
def __init__(self, init_word_vectors, vocab, repel_path_list, attract_path_list, **kwargs):
super(EmbeddingInitEnhancer, self).__init__(**kwargs)
self.build_word_vector_map(init_word_vectors, vocab)
self.init_vocab = vocab
self.repel_path_list = repel_path_list
self.attract_path_list = attract_path_list
self.repel = set()
self.attract = set()
        # collect the linguistic constraints (synonym/antonym pairs) that fall inside the vocabulary:
for syn_filepath in self.attract_path_list:
self.attract = self.attract | self.load_constraints(syn_filepath, self.vocab)
for ant_filepath in self.repel_path_list:
self.repel = self.repel | self.load_constraints(ant_filepath, self.vocab)
# finally, set the experiment hyperparameters:
self.set_hyperparameters()
def build_word_vector_map(self, init_word_vectors, vocab):
self.word_vectors = {}
        for i in range(len(vocab)):
            self.word_vectors[vocab[i]] = init_word_vectors[i]
self.vocab = set(vocab)
def vector_map_to_vectors(self, word_vectors):
vector_list = [word_vectors[v] for v in self.init_vocab]
return np.vstack(vector_list)
def load_constraints(self, constraints_filepath, vocab):
"""
This methods reads a collection of constraints from the specified file, and returns a set with
true constraints for which both of their constituent words are in the specified vocabulary.
"""
        constraints_filepath = constraints_filepath.strip()
        constraints = set()
        with open(constraints_filepath, "r") as f:
            for line in f:
                word_pair = line.split()
                if word_pair[0] in vocab and word_pair[1] in vocab and word_pair[0] != word_pair[1]:
                    # store the pair in both orders so membership checks are symmetric
                    constraints.add((word_pair[0], word_pair[1]))
                    constraints.add((word_pair[1], word_pair[0]))
self.info("%s yielded %s constraints." % (constraints_filepath, len(constraints)))
return constraints
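    # The constraints files are assumed to contain one whitespace-separated word
    # pair per line, for example (hypothetical contents of an antonym file):
    #
    #   east    west
    #   expensive    cheap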
def set_hyperparameters(self):
"""
This method sets the hyperparameters of the procedure as specified in the paper.
"""
self.hyper_k1 = 0.1
self.hyper_k2 = 0.1
self.hyper_k3 = 0.1
self.delta = 1.0
self.gamma = 0.0
self.rho = 0.2
self.info("embedding init enhancer hyperparameters --- k_1: %s, k_2: %s, k_3: %s, delta: %s, gamma: %s, rho: %s" %
(self.hyper_k1, self.hyper_k2, self.hyper_k3, self.delta, self.gamma, self.rho))
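    # These values correspond to the weights k_1, k_2, k_3 of the antonym-repel,
    # synonym-attract and vector-space-preservation terms of the counter-fitting
    # objective, roughly C(V') = k_1*AR(V') + k_2*SA(V') + k_3*VSP(V, V'), where
    # delta and gamma are the target distance margins for antonym and synonym
    # pairs and rho is the neighbourhood radius used when collecting VSP pairs
    # (see the cited counter-fitting paper).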
def get_enhanced_embedding(self, from_pretrained_vector=False):
"""
This method repeatedly applies SGD steps to counter-fit word vectors to linguistic constraints.
"""
word_vectors = self.word_vectors
repel = self.repel
attract = self.attract
current_iteration = 0
if from_pretrained_vector:
vsp_pairs = {}
if self.hyper_k3 > 0.0: # if we need to compute the VSP terms.
vsp_pairs = self.compute_vsp_pairs(word_vectors, self.vocab, rho=self.rho)
# Post-processing: remove synonym pairs which are deemed to be both synonyms and antonyms:
for repel_pair in repel:
if repel_pair in attract:
attract.remove(repel_pair)
if from_pretrained_vector and repel_pair in vsp_pairs:
del vsp_pairs[repel_pair]
max_iter = 20
self.info("repel pairs: %s, attract pairs: %s" % (len(repel), len(attract)))
self.info("Running the optimisation procedure for %s SGD steps..." % max_iter)
while current_iteration < max_iter:
current_iteration += 1
vsp_pairs = vsp_pairs if from_pretrained_vector else None
word_vectors = self.one_step_SGD(word_vectors, attract, repel, vsp_pairs)
return self.vector_map_to_vectors(word_vectors)
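    # Example end-to-end use (a sketch; file paths and the vocabulary are hypothetical):
    #
    #   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
    #   vocab = ['<pad>', 'east', 'west', 'expensive', 'cheap']
    #   init_vectors = manager.init_word_embedding1(vocab, dim_size=300)
    #   enhancer = EmbeddingInitEnhancer(init_vectors, vocab,
    #                                    repel_path_list=['constraints/antonyms.txt'],
    #                                    attract_path_list=['constraints/synonyms.txt'])
    #   enhanced = enhancer.get_enhanced_embedding()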
def one_step_SGD(self, word_vectors, attract_pairs, repel_pairs, vsp_pairs=None):
"""
        This method performs a step of SGD to optimise the counter-fitting cost function.
        """