# Source code for rhoknp.units.morpheme

import re
from functools import cached_property
from typing import TYPE_CHECKING, ClassVar, Optional, Union

try:
    from typing import override  # type: ignore[attr-defined]
except ImportError:
    from typing_extensions import override

from rhoknp.props.feature import FeatureDict
from rhoknp.props.semantics import SemanticsDict
from rhoknp.units.unit import Unit

if TYPE_CHECKING:
    from rhoknp.units.base_phrase import BasePhrase
    from rhoknp.units.clause import Clause
    from rhoknp.units.document import Document
    from rhoknp.units.phrase import Phrase
    from rhoknp.units.sentence import Sentence


class Morpheme(Unit):
    """A morpheme, the lowest-level linguistic unit (it has no child units)."""

    # Attribute names, in the order they are written out in a Juman++ line.
    _ATTRIBUTES = (
        "surf",
        "reading",
        "lemma",
        "pos",
        "pos_id",
        "subpos",
        "subpos_id",
        "conjtype",
        "conjtype_id",
        "conjform",
        "conjform_id",
    )
    # Matches "reading lemma pos pos_id subpos subpos_id conjtype conjtype_id conjform conjform_id".
    _ATTRIBUTE_PAT: ClassVar[re.Pattern] = re.compile(
        r"([^ ]+| [^ ]*) ([^ ]+| [^ ]*) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+)"
    )
    # Fallback for the case where reading and lemma are the same (possibly space-containing) string.
    _ATTRIBUTE_PAT_REPEATED: ClassVar[re.Pattern] = re.compile(
        r"(?P<pat>.+) ((?P=pat)) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+)"
    )
    # Full morpheme line: surface, attributes, optional semantics, optional features.
    PAT: ClassVar[re.Pattern] = re.compile(
        r"(?P<surf>^([^ ]+| [^ ]*))"
        + rf"( (?P<attrs>{_ATTRIBUTE_PAT.pattern}))"
        + rf"( {SemanticsDict.PAT.pattern})?"
        + rf"( {FeatureDict.PAT.pattern})?$"
    )
    # Fallback line pattern where surface, reading and lemma are all the same string.
    PAT_REPEATED: ClassVar[re.Pattern] = re.compile(
        r"(?P<surf>.+) (?P<attrs>(?P=surf) (?P=surf) [^ ]+ \d+ [^ ]+ \d+ [^ ]+ \d+ [^ ]+ \d+)"
        + rf"( {SemanticsDict.PAT.pattern})?"
        + rf"( {FeatureDict.PAT.pattern})?$"
    )

    # https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44
    # Half-width characters and the full-width characters they are escaped to.
    # The values are written as escapes so that they survive width normalization of this file.
    _ESCAPE_MAP_HALF_TO_FULL_WIDTH: ClassVar[dict[str, str]] = {
        " ": "\u3000",  # IDEOGRAPHIC SPACE
        '"': "”",
        "<": "\uff1c",  # FULLWIDTH LESS-THAN SIGN
        ">": "\uff1e",  # FULLWIDTH GREATER-THAN SIGN
    }
    _UNESCAPE_MAP_HALF_TO_FULL_WIDTH: ClassVar[dict[str, str]] = {
        v: k for k, v in _ESCAPE_MAP_HALF_TO_FULL_WIDTH.items()
    }
    # https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc4/src/jumandic/shared/juman_format.cc#L44
    _ESCAPE_MAP_CONTROL_CHAR: ClassVar[dict[str, str]] = {"\t": r"\t", " ": r"\␣"}
    _UNESCAPE_MAP_CONTROL_CHAR: ClassVar[dict[str, str]] = {v: k for k, v in _ESCAPE_MAP_CONTROL_CHAR.items()}

    # Class-level counter used to assign ``index`` to newly created morphemes.
    count = 0

    def __init__(
        self,
        text: str,
        reading: str,
        lemma: str,
        pos: str,
        pos_id: int,
        subpos: str,
        subpos_id: int,
        conjtype: str,
        conjtype_id: int,
        conjform: str,
        conjform_id: int,
        semantics: Optional[SemanticsDict] = None,
        features: Optional[FeatureDict] = None,
        homograph: bool = False,
    ) -> None:
        """Initialize a morpheme.

        Args:
            text: Surface form.
            reading: Reading.
            lemma: Lemma (base form).
            pos: Part of speech.
            pos_id: Part-of-speech ID.
            subpos: Part-of-speech subcategory.
            subpos_id: Part-of-speech subcategory ID.
            conjtype: Conjugation type.
            conjtype_id: Conjugation type ID.
            conjform: Conjugation form.
            conjform_id: Conjugation form ID.
            semantics: Semantic information; an empty ``SemanticsDict`` is used when None.
            features: Features; an empty ``FeatureDict`` is used when None.
            homograph: If True, the class-level counter is not advanced for this morpheme.
        """
        super().__init__()
        self.text = text
        self.reading = reading  #: Reading.
        self.lemma = lemma  #: Lemma (base form).
        self.pos = pos  #: Part of speech.
        self.pos_id = pos_id  #: Part-of-speech ID.
        self.subpos = subpos  #: Part-of-speech subcategory.
        self.subpos_id = subpos_id  #: Part-of-speech subcategory ID.
        self.conjtype = conjtype  #: Conjugation type.
        self.conjtype_id = conjtype_id  #: Conjugation type ID.
        self.conjform = conjform  #: Conjugation form.
        self.conjform_id = conjform_id  #: Conjugation form ID.

        # parent unit
        self._base_phrase: Optional["BasePhrase"] = None
        self._sentence: Optional["Sentence"] = None

        self.semantics: SemanticsDict = (
            semantics if semantics is not None else SemanticsDict()
        )  #: Semantic information taken from the dictionary.
        self.features: FeatureDict = features if features is not None else FeatureDict()  #: Features.
        self.homographs: list["Morpheme"] = []  #: Homographs of this morpheme.

        self.index = self.count  #: Index within the sentence.
        # A homograph takes the current counter value without advancing it.
        if homograph is False:
            Morpheme.count += 1

    @override
    def __eq__(self, other: object) -> bool:
        """Return True if ``other`` is the same type, has an equal parent unit, and the same index.

        NOTE(review): this class defines ``__eq__`` without ``__hash__``, so Python sets
        ``__hash__`` to None for it and instances are unhashable.
        """
        if not isinstance(other, type(self)):
            return False
        if self.parent_unit != other.parent_unit:
            return False
        return self.index == other.index

    @cached_property
    def global_index(self) -> int:
        """Index of this morpheme within the whole document."""
        # Without a document, or in the first sentence, the local index is the global one.
        if not self.sentence.has_document():
            return self.index
        if self.sentence.index == 0:
            return self.index
        # Non-initial morphemes are offset from the first morpheme of their sentence.
        if self.index > 0:
            return self.sentence.morphemes[0].global_index + self.index
        # The first morpheme of a sentence continues from the end of the previous sentence.
        prev_sentence = self.document.sentences[self.sentence.index - 1]
        return prev_sentence.morphemes[0].global_index + len(prev_sentence.morphemes)

    @property
    def parent_unit(self) -> Optional[Union["BasePhrase", "Sentence"]]:
        """The parent unit (a base phrase or a sentence), or None if not registered.

        .. note::
            Returns the base phrase when analyzed by KNP and the sentence when analyzed by Jumanpp.
        """
        if self._base_phrase is not None:
            return self._base_phrase
        if self._sentence is not None:
            return self._sentence
        return None

    @property
    def child_units(self) -> None:
        """Child units. Always None because a morpheme is the lowest-level unit."""
        return

    @property
    def document(self) -> "Document":
        """The document.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.sentence.document

    @property
    def sentence(self) -> "Sentence":
        """The sentence, taken from the base phrase when it has not been set directly."""
        # Compare against None explicitly so the result does not depend on Sentence truthiness.
        if self._sentence is not None:
            return self._sentence
        return self.base_phrase.sentence

    @sentence.setter
    def sentence(self, sentence: "Sentence") -> None:
        """Set the sentence.

        Args:
            sentence: The sentence.
        """
        self._sentence = sentence

    @property
    def clause(self) -> "Clause":
        """The clause.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.base_phrase.clause

    @property
    def phrase(self) -> "Phrase":
        """The phrase.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.base_phrase.phrase

    @property
    def base_phrase(self) -> "BasePhrase":
        """The base phrase.

        Raises:
            AttributeError: If the base phrase has not been set.
        """
        if self._base_phrase is None:
            raise AttributeError("base_phrase has not been set")
        return self._base_phrase

    @base_phrase.setter
    def base_phrase(self, base_phrase: "BasePhrase") -> None:
        """Set the base phrase.

        Args:
            base_phrase: The base phrase.
        """
        self._base_phrase = base_phrase

    @property
    def surf(self) -> str:
        """Surface form."""
        return self.text

    @property
    def canon(self) -> Optional[str]:
        """Canonical representation (the "代表表記" entry of the semantics), or None if absent."""
        canon = self.semantics.get("代表表記")
        assert canon is None or isinstance(canon, str)
        return canon

    @property
    def sstring(self) -> str:
        """Semantic information in Juman++ format."""
        return self.semantics.to_sstring()

    @property
    def fstring(self) -> str:
        """Features in Juman++ format."""
        return self.features.to_fstring()

    @cached_property
    def parent(self) -> Optional["Morpheme"]:
        """The morpheme this morpheme depends on, or None if there is none."""
        # The head of a base phrase depends on the head of the parent base phrase.
        if self.base_phrase.head == self:
            if self.base_phrase.parent is not None:
                return self.base_phrase.parent.head
            return None
        # Every other morpheme depends on the head of its own base phrase.
        return self.base_phrase.head

    @cached_property
    def span(self) -> tuple[int, int]:
        """Character-level span ``(start, end)`` within the sentence."""
        if self.index == 0:
            start = 0
        else:
            # Start where the previous morpheme ends.
            _, start = self.sentence.morphemes[self.index - 1].span
        end = start + len(self.text)  # TODO: correctly handle multibyte characters
        return start, end

    @cached_property
    def global_span(self) -> tuple[int, int]:
        """Character-level span ``(start, end)`` within the whole document."""
        # Shift the sentence-level span by the text length of all preceding sentences.
        offset = sum(len(prev_sentence.text) for prev_sentence in self.document.sentences[: self.sentence.index])
        start, end = self.span
        return start + offset, end + offset

    @cached_property
    def children(self) -> list["Morpheme"]:
        """Morphemes in the same sentence whose ``parent`` is this morpheme."""
        return [morpheme for morpheme in self.sentence.morphemes if morpheme.parent == self]

    @classmethod
    def from_jumanpp(cls, jumanpp_text: str) -> "Morpheme":
        """Create a morpheme from a Juman++ analysis result.

        The first line is the morpheme itself; every following line must be a
        homograph line ("@ " followed by a morpheme line).

        Args:
            jumanpp_text: Juman++ analysis result.

        Raises:
            ValueError: If the first line or any homograph line is malformed.
        """
        first_line, *lines = jumanpp_text.rstrip().split("\n")
        morpheme = cls._from_jumanpp_line(first_line)
        for line in lines:
            # Raise instead of assert: this validates input and must survive ``python -O``.
            if not cls.is_homograph_line(line):
                raise ValueError(f"malformed homograph line: {line}")
            # Drop the leading "@ " marker before parsing.
            homograph = cls._from_jumanpp_line(line[2:], homograph=True)
            morpheme.homographs.append(homograph)
        return morpheme

    @classmethod
    def _from_jumanpp_line(cls, jumanpp_line: str, homograph: bool = False) -> "Morpheme":
        """Create a morpheme from a single line of a Juman++ analysis result.

        Args:
            jumanpp_line: One morpheme line (without a leading "@ " marker).
            homograph: Whether the line describes a homograph.

        Raises:
            ValueError: If the line matches neither ``PAT`` nor ``PAT_REPEATED``.
        """
        match = cls.PAT.match(jumanpp_line) or cls.PAT_REPEATED.match(jumanpp_line)
        if match is None:
            raise ValueError(f"malformed morpheme line: {jumanpp_line}")
        match_attr = cls._ATTRIBUTE_PAT.match(match["attrs"]) or cls._ATTRIBUTE_PAT_REPEATED.match(match["attrs"])
        assert match_attr is not None
        attributes = match_attr.groups()
        surf, reading, lemma = match["surf"], attributes[0], attributes[1]
        semantics = SemanticsDict.from_sstring(match["sems"] or "")
        # Restore the text if it was escaped to full-width (Juman++ 2.0.0-rc3)
        if semantics.get("元半角") is True:
            surf, reading, lemma = (  # pragma: no cover
                cls._UNESCAPE_MAP_HALF_TO_FULL_WIDTH.get(s, s) for s in (surf, reading, lemma)
            )
        # Restore escaped tab / space; only a value that is exactly an escape token is replaced.
        surf, reading, lemma = (cls._UNESCAPE_MAP_CONTROL_CHAR.get(s, s) for s in (surf, reading, lemma))
        return cls(
            surf,
            reading,
            lemma,
            attributes[2],
            int(attributes[3]),
            attributes[4],
            int(attributes[5]),
            attributes[6],
            int(attributes[7]),
            attributes[8],
            int(attributes[9]),
            semantics=semantics,
            features=FeatureDict.from_fstring(match["feats"] or ""),
            homograph=homograph,
        )

    def to_jumanpp(self) -> str:
        """Convert to Juman++ format.

        Returns:
            The morpheme line followed by one "@ "-prefixed line per homograph,
            each terminated by a newline.
        """
        line = self._to_jumanpp_line()
        if self.features:
            line += f" {self.features.to_fstring()}"
        # Each homograph's own output already ends with a newline.
        homograph_lines = "".join(f"@ {homograph.to_jumanpp()}" for homograph in self.homographs)
        return f"{line}\n{homograph_lines}"

    def to_knp(self) -> str:
        """Convert to KNP format.

        Homographs are not written as separate lines; each one is recorded as an
        ``ALT-...`` feature on this morpheme's line.
        """
        ret = self._to_jumanpp_line()
        # Copy (per the original comment) so the ALT entries are added to the copy only.
        features = FeatureDict(self.features)
        for homograph in self.homographs:
            alt_feature_key = "ALT-{}-{}-{}-{}-{}-{}-{}-{}".format(  # noqa: UP032
                homograph.surf,
                homograph.reading,
                homograph.lemma,
                homograph.pos_id,
                homograph.subpos_id,
                homograph.conjtype_id,
                homograph.conjform_id,
                homograph.semantics.to_sstring(),
            )
            features[alt_feature_key] = True
        if features:
            ret += f" {features.to_fstring()}"
        ret += "\n"
        return ret

    def _to_jumanpp_line(self) -> str:
        """Build the attribute and semantics part of a Juman++ line (no features, no newline)."""
        attrs: list[str] = []
        for attr_name in self._ATTRIBUTES:
            attr = getattr(self, attr_name)
            # Escape a surface/reading/lemma that is exactly a tab or a space.
            if attr_name in ("surf", "reading", "lemma"):
                attr = self._ESCAPE_MAP_CONTROL_CHAR.get(attr, attr)
            attrs.append(str(attr))
        ret = " ".join(attrs)
        # Semantics are also written when is_nil() is true, even if the dict itself is falsy.
        if self.semantics or self.semantics.is_nil():
            ret += f" {self.semantics.to_sstring()}"
        return ret

    @staticmethod
    def is_morpheme_line(line: str) -> bool:
        """Return True if ``line`` matches ``PAT`` or ``PAT_REPEATED``."""
        return Morpheme.PAT.match(line) is not None or Morpheme.PAT_REPEATED.match(line) is not None

    @staticmethod
    def is_homograph_line(line: str) -> bool:
        """Return True if ``line`` is "@ " followed by a morpheme line."""
        # Require the full "@ " marker: the two characters stripped below must be exactly the
        # marker, otherwise a morpheme whose surface merely starts with "@" could be misread.
        return line.startswith("@ ") and Morpheme.is_morpheme_line(line[2:])