# -*- coding: utf-8 -*-
import os
import h5py
import numpy as np
import copy
import math
from config import HERE
from utils.logger import BaseLogger
class H5EmbeddingManager(BaseLogger):
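    """
    Wraps an HDF5 embedding file that stores an 'embedding' matrix, a
    'words_flatten' string (the vocabulary joined by newlines) and a
    'vocab_len' attribute. With mode='disk' rows are read lazily from the
    file; with mode='in-memory' the full matrix is loaded as a numpy array.
    """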
def __init__(self, h5_path, mode='disk', **kwargs):
super(H5EmbeddingManager, self).__init__(**kwargs)
        self.mode = mode
        f = h5py.File(h5_path, 'r')
        if mode == 'disk':
            # keep the HDF5 dataset on disk and read rows lazily
            self.W = f['embedding']
        elif mode == 'in-memory':
            # load the whole embedding matrix into memory
            self.W = f['embedding'][:]
        else:
            raise ValueError("unknown mode: %s" % mode)
message = "load mode=%s, embedding data type=%s, shape=%s" % (self.mode, type(self.W), self.W.shape)
self.info(message)
        words_flatten = f['words_flatten'][0]
        if isinstance(words_flatten, bytes):
            # h5py may hand back the flattened vocabulary as bytes; decode before splitting
            words_flatten = words_flatten.decode('utf-8')
        self.id2word = words_flatten.split('\n')
        assert len(self.id2word) == f.attrs['vocab_len'], "%s != %s" % (len(self.id2word), f.attrs['vocab_len'])
        self.word2id = dict(zip(self.id2word, range(len(self.id2word))))
        del words_flatten
    def __getitem__(self, item):
        if isinstance(item, str):
            index = self.word2id[item]
            return self.W[index]
        raise RuntimeError("don't support type: %s" % type(item))
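    # Example lookup (a sketch; the HDF5 path is hypothetical):
    #   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
    #   vec = manager['house']   # one row of the 'embedding' dataset, as a 1-D numpy array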
def init_word_embedding(self, words, dim_size=300, scale=0.1, mode='google'):
        self.info('loading word embedding.')
word2id = self.word2id
W = self.W
shape = (len(words), dim_size)
np.random.seed(len(words))
# W2V = np.random.uniform(low=-scale, high=scale, size=shape).astype('float32')
W2V = np.zeros(shape, dtype='float32')
        # row 0 (e.g. the padding token) is left all-zero
        for i, word in enumerate(words[1:], 1):
            if word in word2id:
                _id = word2id[word]
                # copy before normalising so the stored embedding is not modified in place
                vec = np.array(W[_id], dtype='float32')
                vec /= np.linalg.norm(vec)
            elif word.capitalize() in word2id:
                _id = word2id[word.capitalize()]
                vec = np.array(W[_id], dtype='float32')
                vec /= np.linalg.norm(vec)
            else:
                # OOV words get a small random vector
                vec = (0.01 * np.random.normal(0, 1.0, dim_size)).astype('float32')
            W2V[i] = vec[:dim_size]
return W2V
def init_word_embedding1(self, words, dim_size=300, scale=0.1, mode='google'):
word2id = self.word2id
W = self.W
shape = (len(words), dim_size)
np.random.seed(len(words))
# W2V = np.random.uniform(low=-scale, high=scale, size=shape).astype('float32')
W2V = np.random.normal(0, 1.0, size=shape).astype('float32') * 0.01
W2V[0, :] = 0
if mode == 'random':
return W2V
        in_vocab = np.ones(shape[0], dtype=bool)
oov_set = set()
word_ids = []
for i, word in enumerate(words):
            # prefer the exact form, then the capitalised form
            _id = word2id.get(word, -1)
            if _id < 0:
                _id = word2id.get(word.capitalize(), -1)
            if _id < 0:
                in_vocab[i] = False
                if not word.startswith("$oov-"):
                    oov_set.add(word)
            else:
                word_ids.append(_id)
        if self.mode == 'in-memory':
            # assign through W2V directly; W2V[in_vocab][:, :] = ... would write to a copy
            W2V[in_vocab] = W[np.array(word_ids, dtype='int32')][:, :dim_size]
        else:
            # word_ids only holds ids for in-vocab words, so pair it positionally
            # with the nonzero positions of the in_vocab mask
            nonzero_ids = in_vocab.nonzero()[0]
            for j, i in enumerate(nonzero_ids):
                emb = W[word_ids[j]]
                W2V[i][:] = emb[:dim_size]
        # self.info("%s words are not in the pretrained embedding and are "
        #           "randomly initialized: %s" % (len(oov_set), oov_set))
return W2V
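
# Example of building an initial embedding matrix for a task vocabulary
# (a sketch; the HDF5 path and vocabulary are hypothetical):
#
#   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
#   vocab = ['<pad>', 'east', 'west', 'expensive', 'cheap']
#   W_init = manager.init_word_embedding1(vocab, dim_size=300)
#   # row 0 stays all-zero; OOV words keep their small random vectors
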
class EmbeddingInitEnhancer(BaseLogger):
'''
For more details, read "Counter-fitting Word Vectors to Linguistic Constraints"
'''
def __init__(self, init_word_vectors, vocab, repel_path_list, attract_path_list, **kwargs):
super(EmbeddingInitEnhancer, self).__init__(**kwargs)
self.build_word_vector_map(init_word_vectors, vocab)
self.init_vocab = vocab
self.repel_path_list = repel_path_list
self.attract_path_list = attract_path_list
self.repel = set()
self.attract = set()
        # collect the linguistic constraints (synonym/antonym pairs) that fall inside the vocabulary:
for syn_filepath in self.attract_path_list:
self.attract = self.attract | self.load_constraints(syn_filepath, self.vocab)
for ant_filepath in self.repel_path_list:
self.repel = self.repel | self.load_constraints(ant_filepath, self.vocab)
# finally, set the experiment hyperparameters:
self.set_hyperparameters()
def build_word_vector_map(self, init_word_vectors, vocab):
self.word_vectors = {}
        for i in range(len(vocab)):
            self.word_vectors[vocab[i]] = init_word_vectors[i]
self.vocab = set(vocab)
def vector_map_to_vectors(self, word_vectors):
vector_list = [word_vectors[v] for v in self.init_vocab]
return np.vstack(vector_list)
def load_constraints(self, constraints_filepath, vocab):
"""
This methods reads a collection of constraints from the specified file, and returns a set with
true constraints for which both of their constituent words are in the specified vocabulary.
"""
        constraints_filepath = constraints_filepath.strip()
        constraints = set()
        with open(constraints_filepath, "r") as f:
            for line in f:
                word_pair = line.split()
                if word_pair[0] in vocab and word_pair[1] in vocab and word_pair[0] != word_pair[1]:
                    # store the pair in both orders so membership checks are symmetric
                    constraints.add((word_pair[0], word_pair[1]))
                    constraints.add((word_pair[1], word_pair[0]))
self.info("%s yielded %s constraints." % (constraints_filepath, len(constraints)))
return constraints
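    # The constraints files are assumed to contain one whitespace-separated word
    # pair per line, for example (hypothetical contents of an antonym file):
    #
    #   east    west
    #   expensive    cheap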
def set_hyperparameters(self):
"""
This method sets the hyperparameters of the procedure as specified in the paper.
"""
self.hyper_k1 = 0.1
self.hyper_k2 = 0.1
self.hyper_k3 = 0.1
self.delta = 1.0
self.gamma = 0.0
self.rho = 0.2
self.info("embedding init enhancer hyperparameters --- k_1: %s, k_2: %s, k_3: %s, delta: %s, gamma: %s, rho: %s" %
(self.hyper_k1, self.hyper_k2, self.hyper_k3, self.delta, self.gamma, self.rho))
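    # These values correspond to the weights k_1, k_2, k_3 of the antonym-repel,
    # synonym-attract and vector-space-preservation terms of the counter-fitting
    # objective, roughly C(V') = k_1*AR(V') + k_2*SA(V') + k_3*VSP(V, V'), where
    # delta and gamma are the target distance margins for antonym and synonym
    # pairs and rho is the neighbourhood radius used when collecting VSP pairs
    # (see the cited counter-fitting paper).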
def get_enhanced_embedding(self, from_pretrained_vector=False):
"""
This method repeatedly applies SGD steps to counter-fit word vectors to linguistic constraints.
"""
word_vectors = self.word_vectors
repel = self.repel
attract = self.attract
current_iteration = 0
if from_pretrained_vector:
vsp_pairs = {}
if self.hyper_k3 > 0.0: # if we need to compute the VSP terms.
vsp_pairs = self.compute_vsp_pairs(word_vectors, self.vocab, rho=self.rho)
# Post-processing: remove synonym pairs which are deemed to be both synonyms and antonyms:
for repel_pair in repel:
if repel_pair in attract:
attract.remove(repel_pair)
if from_pretrained_vector and repel_pair in vsp_pairs:
del vsp_pairs[repel_pair]
max_iter = 20
self.info("repel pairs: %s, attract pairs: %s" % (len(repel), len(attract)))
self.info("Running the optimisation procedure for %s SGD steps..." % max_iter)
while current_iteration < max_iter:
current_iteration += 1
vsp_pairs = vsp_pairs if from_pretrained_vector else None
word_vectors = self.one_step_SGD(word_vectors, attract, repel, vsp_pairs)
return self.vector_map_to_vectors(word_vectors)
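    # Example end-to-end use (a sketch; file paths and the vocabulary are hypothetical):
    #
    #   manager = H5EmbeddingManager('embeddings/word2vec.h5', mode='in-memory')
    #   vocab = ['<pad>', 'east', 'west', 'expensive', 'cheap']
    #   init_vectors = manager.init_word_embedding1(vocab, dim_size=300)
    #   enhancer = EmbeddingInitEnhancer(init_vectors, vocab,
    #                                    repel_path_list=['constraints/antonyms.txt'],
    #                                    attract_path_list=['constraints/synonyms.txt'])
    #   enhanced = enhancer.get_enhanced_embedding()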
def one_step_SGD(self, word_vectors, attract_pairs, repel_pairs, vsp_pairs=None):
"""
        This method performs a step of SGD to optimise the counter-fitting cost function.
        """