在线问诊 Python、FastAPI、Neo4j — 构建问题分类器

将问题进行分析,和系统已有的分类进行关联

构建字典数据

将构建的知识图片字典化, 用于后面对问题的解析,下图为症状的字典,其它字典同理
在线问诊 Python、FastAPI、Neo4j — 构建问题分类器

构建 Trie 字典树

将建字典数据,组装集合

cur_dir = '/'.join(os.path.abspath(__file__).split('/')[:-1]) #  特征词路径 self.disease_path = os.path.join(cur_dir, '../dict/disease.txt') self.check_path = os.path.join(cur_dir, '../dict/examine.txt') self.drug_path = os.path.join(cur_dir, '../dict/drug.txt') self.food_path = os.path.join(cur_dir, '../dict/food.txt') self.symptom_path = os.path.join(cur_dir, '../dict/symptom.txt') self.deny_path = os.path.join(cur_dir, '../dict/deny.txt') # 加载数据 self.disease_wds = [i.strip() for i in open(self.disease_path, encoding="utf-8") if i.strip()]   # ['干眼', '右膝髌上囊及关节腔少量积液'] self.check_wds = [i.strip() for i in open(self.check_path, encoding="utf-8") if i.strip()]  # ['膝关节核磁', '视力', '砂眼', '辨色力', '角膜', '眼底'] self.drug_wds = [i.strip() for i in open(self.drug_path, encoding="utf-8") if i.strip()] self.food_wds = [i.strip() for i in open(self.food_path, encoding="utf-8") if i.strip()] self.symptom_wds = [i.strip() for i in open(self.symptom_path, encoding="utf-8") if i.strip()] # ['畏光','干涩','看东西有时候清楚有时候不清楚']  # 读出所有 dict 里面的字典数据,并拼接成一个大而全的 集合 # ['干眼', '右膝髌上囊及关节腔少量积液','膝关节核磁', '视力', '砂眼', '辨色力', '角膜', '眼底','畏光','干涩','看东西有时候清楚有时候不清楚'] self.region_words = set(self.disease_wds + self.check_wds + self.drug_wds + self.food_wds + self.symptom_wds) 

构建 Trie 字典树
Trie字典树:https://www.cnblogs.com/vipsoft/p/17722820.html
Aho-Corasick 算法 AC自动机实现:https://www.cnblogs.com/vipsoft/p/17722761.html

# 目的是为了将来对用户提的问题,进行关键词快速提取 def build_actree(self, word_list):     """     构造actree,加速过滤     :param word_list:     :return:     """     actree = ahocorasick.Automaton()     for index, word in enumerate(word_list):         actree.add_word(word, (index, word))  # 向trie树中添加单词     actree.make_automaton()     return actree 

按实体组装字典

# 将 ['干眼', '右膝髌上囊及关节腔少量积液','膝关节核磁', '视力', '砂眼', '辨色力', '角膜', '眼底'],进行分类,组装成不同类型的字典 def build_wdtype_dict(self):     """     构造词对应的类型     :return:     """     wd_dict = dict()     for wd in self.region_words:         wd_dict[wd] = []         if wd in self.disease_wds:             wd_dict[wd].append('disease')         if wd in self.check_wds:             wd_dict[wd].append('check')         if wd in self.drug_wds:             wd_dict[wd].append('drug')         if wd in self.food_wds:             wd_dict[wd].append('food')         if wd in self.symptom_wds:             wd_dict[wd].append('symptom')     return wd_dict 

问题分析

通过AC算法,过滤关键词

# "请问最近看东西有时候清楚有时候不清楚是怎么回事" def check_medical(self, question):     """     问句过滤     :param question:     :return:     """     region_wds = []     for i in self.region_tree.iter(question):  # 从问题中,找出关键词         wd = i[1][1]  # 看东西有时候清楚有时候不清楚         region_wds.append(wd)     stop_wds = []     for wd1 in region_wds:         for wd2 in region_wds:             if wd1 in wd2 and wd1 != wd2:                 stop_wds.append(wd1)     final_wds = [i for i in region_wds if i not in stop_wds]  # '看东西有时候清楚有时候不清楚'     medical_dict = {i: self.wdtype_dict.get(i) for i in final_wds}  # {'看东西有时候清楚有时候不清楚': ['symptom']}     return medical_dict 

解析出问题的类型

data['args'] = medical_dict # 若没有查到相关的外部查询信息,那么则将该疾病的描述信息返回 if question_types == [] and 'symptom' in types:    question_types = ['symptom_disease'] # 将多个分类结果进行合并处理,组装成一个字典 data['question_types'] = question_types 

输出字典

question = "请问最近看东西有时候清楚有时候不清楚是怎么回事" # 最终输出 data = {'args': {'看东西有时候清楚有时候不清楚': ['symptom']}, 'question_types': ['symptom_disease']}  question = "干眼常用药有哪些" # 最终输出 data = {'args': {'干眼': ['disease']}, 'question_types': ['disease_drug']}  question = "干眼哪些不能吃" data = {'args': {'干眼': ['disease']}, 'question_types': ['disease_not_food']} 

后面根据 question_types 生成 CQL语句

源代码地址:https://gitee.com/VipSoft/VipQA

发表评论

评论已关闭。

相关文章