Source code for rhoknp.props.named_entity

import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING, ClassVar, Optional

if TYPE_CHECKING:
    from rhoknp.units.morpheme import Morpheme

logger = logging.getLogger(__name__)


[docs] class NamedEntityCategory(Enum): """固有表現カテゴリを表す列挙体.""" ORGANIZATION = "ORGANIZATION" PERSON = "PERSON" LOCATION = "LOCATION" ARTIFACT = "ARTIFACT" DATE = "DATE" TIME = "TIME" MONEY = "MONEY" PERCENT = "PERCENT" OPTIONAL = "OPTIONAL"
[docs] @classmethod def has_value(cls, value: str) -> bool: """有効な固有表現カテゴリであれば True. Args: value: 固有表現のカテゴリ. """ return any(value == item.value for item in cls)
[docs] @dataclass class NamedEntity: """固有表現を表すクラス.""" PAT: ClassVar[re.Pattern] = re.compile(r"<NE:(?P<cat>\w+):(?P<name>([^>\\]|\\>?)+)>") category: NamedEntityCategory morphemes: list["Morpheme"] def __str__(self) -> str: return self.text @property def text(self) -> str: """固有表現の表層文字列.""" return "".join(m.text for m in self.morphemes)
[docs] @classmethod def from_fstring(cls, fstring: str, candidate_morphemes: list["Morpheme"]) -> Optional["NamedEntity"]: """KNP における素性文字列からオブジェクトを作成.""" match = cls.PAT.match(fstring) if match is None: logger.warning(f"{fstring} is not a valid NE fstring") return None category: str = match["cat"] if not NamedEntityCategory.has_value(category): logger.warning(f"{candidate_morphemes[0].sentence.sid}: unknown NE category: {category}") return None name: str = match["name"].replace(r"\>", ">") span = cls._find_morpheme_span(name, candidate_morphemes) if span is None: logger.warning(f"{candidate_morphemes[0].sentence.sid}: morpheme span of '{name}' not found") return None return NamedEntity(NamedEntityCategory(category), candidate_morphemes[span.start : span.stop])
[docs] def to_fstring(self) -> str: """素性文字列に変換.""" escaped_text = self.text.replace(">", r"\>") # escape ">" return f"<NE:{self.category.value}:{escaped_text}>"
@staticmethod def _find_morpheme_span(name: str, candidates: list["Morpheme"]) -> range | None: """固有表現の文字列にマッチする形態素の範囲を返す. Args: name: 固有表現の文字列 candidates: 固有表現を構成する候補形態素のリスト """ stop = len(candidates) while stop > 0: for start in reversed(range(stop)): if "".join(m.text for m in candidates[start:stop]) == name: return range(start, stop) stop -= 1 return None