Python绝对匹配提取实体

问题描述

./data/city.txt

北京
上海
广州
深圳
海西
海西蒙古族藏族自治州

./data/scene.txt

故宫
长城
圆明园
外滩
白云山
欢乐谷

input:

我想去北京的圆明园玩

output:

[{'start': 3, 'end': 5, 'value': '北京', 'entity': 'city', 'confidence': 1}, {'start': 6, 'end': 9, 'value': '圆明园', 'entity': 'scene', 'confidence': 1}]




解决方案

  1. 文件名作为实体名
  2. 绝对匹配




生成器读取

# -*- coding: utf-8 -*-
# @Author  : XerCis
# @Time    : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词

import os


def read(file_path):
    """Lazily yield the non-empty entries of a UTF-8 dictionary file.

    Each line is stripped of surrounding whitespace (including the trailing
    newline). Lines that are empty after stripping are skipped: an empty
    entry would otherwise be reported as a match at index 0 of every
    message, because ``str.find("")`` returns 0.

    Args:
        file_path: Path of the text file to read, one entry per line.

    Yields:
        Each non-blank line with surrounding whitespace removed.
    """
    with open(file_path, mode="r", encoding="utf-8") as f:
        for line in f:
            entry = line.strip()
            if entry:
                yield entry


def extract(message, dictionary_path):
    """Extract dictionary entities from a message by exact substring match.

    Every ``*.txt`` file directly inside ``dictionary_path`` is treated as a
    dictionary: the file name without the ``.txt`` suffix is the entity name
    and each entry produced by ``read`` is a candidate value. The files are
    streamed again on every call, so nothing is cached in memory.

    Only the first occurrence of each value is reported, since the position
    comes from ``str.find``.

    Args:
        message: Text to search.
        dictionary_path: Directory containing the ``*.txt`` dictionaries.

    Returns:
        A list of dicts with the keys ``start``, ``end`` (exclusive),
        ``value``, ``entity`` and ``confidence`` (always 1), ordered by
        dictionary file name and then by entry order within the file.
    """
    suffix = ".txt"
    entities = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(suffix):
            continue
        entity = file_name[:-len(suffix)]  # the file name doubles as entity name
        for value in read(os.path.join(dictionary_path, file_name)):
            if not value:
                # A blank dictionary line would match at index 0 of any message.
                continue
            start = message.find(value)
            if start == -1:
                continue
            entities.append({
                "start": start,
                "end": start + len(value),
                "value": value,
                "entity": entity,
                "confidence": 1
            })
    return entities


if __name__ == "__main__":
    # Demo: match one sample sentence against the dictionaries in ./data.
    sample_message = "我想去北京的圆明园玩"
    found = extract(sample_message, dictionary_path="./data")
    print(found)

优点:逐行读取文件,内存消耗极小
缺点:每次提取都要重新读取全部词典文件,运行效率低




将文件读进内存

# -*- coding: utf-8 -*-
# @Author  : XerCis
# @Time    : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词

import os


def read(dictionary_path):
    """Load every ``*.txt`` dictionary in a directory into memory.

    The file name without the ``.txt`` suffix is used as the entity name.
    Each line is stripped of surrounding whitespace and blank lines are
    dropped, because an empty entry would match at index 0 of any message
    (``str.find("")`` returns 0).

    Args:
        dictionary_path: Directory containing the ``*.txt`` dictionaries.

    Returns:
        A dict mapping entity name to the list of its entries in file order.
        Keys are inserted in sorted file-name order so results are
        reproducible (``os.listdir`` returns names in arbitrary order).
    """
    suffix = ".txt"
    data = {}
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(suffix):
            continue
        file_path = os.path.join(dictionary_path, file_name)
        with open(file_path, mode="r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            data[file_name[:-len(suffix)]] = [entry for entry in stripped if entry]
    return data


def extract(data, message):
    """Extract dictionary entities from a message by exact substring match.

    Only the first occurrence of each value is reported, since the position
    comes from ``str.find``.

    Args:
        data: Mapping of entity name to an iterable of candidate values.
        message: Text to search.

    Returns:
        A list of dicts with the keys ``start``, ``end`` (exclusive),
        ``value``, ``entity`` and ``confidence`` (always 1), in the iteration
        order of ``data`` and of each value list.
    """
    entities = []
    for entity, values in data.items():
        for value in values:
            if not value:
                # str.find("") returns 0, so an empty entry (e.g. a blank
                # line in the dictionary file) would match every message.
                continue
            start = message.find(value)
            if start == -1:
                continue
            entities.append({
                "start": start,
                "end": start + len(value),
                "value": value,
                "entity": entity,
                "confidence": 1
            })
    return entities


if __name__ == "__main__":
    # Demo: load the dictionaries once, then match a sample sentence.
    data = read("./data")
    message = "我想去北京的圆明园玩"
    print(extract(data, message=message))




重复实体取长或取短

# -*- coding: utf-8 -*-
# @Author  : XerCis
# @Time    : 2020/5/15 17:00
# @Function: 绝对匹配提取实体词

import os
from itertools import combinations


def read(dictionary_path):
    """Load every ``*.txt`` dictionary in a directory into memory.

    The file name without the ``.txt`` suffix is used as the entity name.
    Each line is stripped of surrounding whitespace and blank lines are
    dropped, because an empty entry would match at index 0 of any message
    (``str.find("")`` returns 0).

    Args:
        dictionary_path: Directory containing the ``*.txt`` dictionaries.

    Returns:
        A dict mapping entity name to the list of its entries in file order.
        Keys are inserted in sorted file-name order so results are
        reproducible (``os.listdir`` returns names in arbitrary order).
    """
    suffix = ".txt"
    data = {}
    for file_name in sorted(os.listdir(dictionary_path)):
        if not file_name.endswith(suffix):
            continue
        file_path = os.path.join(dictionary_path, file_name)
        with open(file_path, mode="r", encoding="utf-8") as f:
            stripped = (line.strip() for line in f)
            data[file_name[:-len(suffix)]] = [entry for entry in stripped if entry]
    return data


def extract(data, message, take_long=False, take_short=False):
    """Extract dictionary entities, optionally resolving nested values.

    Matching is an exact substring search; only the first occurrence of each
    value is reported, since the position comes from ``str.find``.

    Two matched entities are considered duplicates when the value of one is
    a substring of the value of the other. The comparison is made on the
    values themselves, not on their positions in ``message``.

    Args:
        data: Mapping of entity name to an iterable of candidate values.
        message: Text to search.
        take_long: If true, keep only the longer entity of each duplicate pair.
        take_short: If true, keep only the shorter entity of each duplicate
            pair.

    Returns:
        A list of dicts with the keys ``start``, ``end`` (exclusive),
        ``value``, ``entity`` and ``confidence`` (always 1).

    Raises:
        ValueError: If both ``take_long`` and ``take_short`` are true.
    """
    if take_long and take_short:
        raise ValueError("take_long and take_short can not be both True")
    entities = []
    for entity, values in data.items():
        for value in values:
            if not value:
                # str.find("") returns 0 and "" is a substring of every value,
                # so an empty entry would match any message and, with
                # take_short, evict every real entity.
                continue
            start = message.find(value)
            if start != -1:
                entities.append({
                    "start": start,
                    "end": start + len(value),
                    "value": value,
                    "entity": entity,
                    "confidence": 1
                })
    if not (take_long or take_short):
        return entities
    # Materialize the pairs first so that removing from `entities` below
    # cannot affect the iteration.
    for first, second in list(combinations(entities, 2)):
        v0, v1 = first["value"], second["value"]
        if v0 in v1 or v1 in v0:
            # For values of equal length the second entity counts as "longer".
            longer, shorter = (first, second) if len(v0) > len(v1) else (second, first)
            if take_long and shorter in entities:
                entities.remove(shorter)
            if take_short and longer in entities:
                entities.remove(longer)
    return entities


if __name__ == "__main__":
    # Demo: run the same sentence once per de-duplication mode.
    data = read("./data")
    message = "海西全称为海西蒙古族藏族自治州"
    for options in ({"take_long": True}, {"take_short": True}):
        print(extract(data, message=message, **options))
    # Expected output, one line per mode (take_long first, then take_short):
    # [{'start': 5, 'end': 15, 'value': '海西蒙古族藏族自治州', 'entity': 'city', 'confidence': 1}]
    # [{'start': 0, 'end': 2, 'value': '海西', 'entity': 'city', 'confidence': 1}]




参考文献

  1. Python可迭代对象、迭代器和生成器的区别
  2. Python os.path() 模块 | 菜鸟教程
  3. Python 字符串
已标记关键词 清除标记
相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页