import re
from functools import cached_property
from typing import TYPE_CHECKING, ClassVar, Optional, Union
try:
from typing import override # type: ignore[attr-defined]
except ImportError:
from typing_extensions import override
from rhoknp.props.feature import FeatureDict
from rhoknp.props.semantics import SemanticsDict
from rhoknp.units.unit import Unit
if TYPE_CHECKING:
from rhoknp.units.base_phrase import BasePhrase
from rhoknp.units.clause import Clause
from rhoknp.units.document import Document
from rhoknp.units.phrase import Phrase
from rhoknp.units.sentence import Sentence
[docs]
class Morpheme(Unit):
    """A morpheme, the lowest-level linguistic unit.

    A morpheme is normally built from one entry of Juman++ output with
    :meth:`from_jumanpp` and can be serialized again with :meth:`to_jumanpp`
    (Juman++ format) or :meth:`to_knp` (KNP format).
    """

    # Attribute names in the order they are written on a Juman++ line.
    _ATTRIBUTES = (
        "surf",
        "reading",
        "lemma",
        "pos",
        "pos_id",
        "subpos",
        "subpos_id",
        "conjtype",
        "conjtype_id",
        "conjform",
        "conjform_id",
    )
    # Ten space-separated fields following the surface form:
    # reading, lemma, pos, pos_id, subpos, subpos_id, conjtype, conjtype_id, conjform, conjform_id.
    _ATTRIBUTE_PAT: ClassVar[re.Pattern] = re.compile(
        r"([^ ]+| [^ ]*) ([^ ]+| [^ ]*) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+)"
    )
    # Fallback for attribute strings whose reading and lemma are the same arbitrary text
    # (which may itself contain spaces).
    _ATTRIBUTE_PAT_REPEATED: ClassVar[re.Pattern] = re.compile(
        r"(?P<pat>.+) ((?P=pat)) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+) ([^ ]+) (\d+)"
    )
    # A full morpheme line: surface form, attributes, then optional semantics and features.
    PAT: ClassVar[re.Pattern] = re.compile(
        r"(?P<surf>^([^ ]+| [^ ]*))"
        + rf"( (?P<attrs>{_ATTRIBUTE_PAT.pattern}))"
        + rf"( {SemanticsDict.PAT.pattern})?"
        + rf"( {FeatureDict.PAT.pattern})?$"
    )
    # Fallback for lines whose surface form, reading and lemma are all the same text.
    PAT_REPEATED: ClassVar[re.Pattern] = re.compile(
        r"(?P<surf>.+) (?P<attrs>(?P=surf) (?P=surf) [^ ]+ \d+ [^ ]+ \d+ [^ ]+ \d+ [^ ]+ \d+)"
        + rf"( {SemanticsDict.PAT.pattern})?"
        + rf"( {FeatureDict.PAT.pattern})?$"
    )

    # https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc3/src/jumandic/shared/juman_format.cc#L44
    # Half-width characters and the full-width characters that replace them in the output.
    # Written as escapes so the full-width forms survive Unicode normalization of this file.
    _ESCAPE_MAP_HALF_TO_FULL_WIDTH: ClassVar[dict[str, str]] = {
        " ": "\u3000",  # IDEOGRAPHIC SPACE
        '"': "\u201d",  # RIGHT DOUBLE QUOTATION MARK
        "<": "\uff1c",  # FULLWIDTH LESS-THAN SIGN
        ">": "\uff1e",  # FULLWIDTH GREATER-THAN SIGN
    }
    # Inverse of the map above (full-width back to half-width).
    _UNESCAPE_MAP_HALF_TO_FULL_WIDTH: ClassVar[dict[str, str]] = {
        v: k for k, v in _ESCAPE_MAP_HALF_TO_FULL_WIDTH.items()
    }

    # https://github.com/ku-nlp/jumanpp/blob/v2.0.0-rc4/src/jumandic/shared/juman_format.cc#L44
    # Whitespace characters and their backslash-escaped textual forms.
    _ESCAPE_MAP_CONTROL_CHAR: ClassVar[dict[str, str]] = {"\t": r"\t", " ": r"\␣"}
    _UNESCAPE_MAP_CONTROL_CHAR: ClassVar[dict[str, str]] = {v: k for k, v in _ESCAPE_MAP_CONTROL_CHAR.items()}

    # Class-level counter: the next index to hand out. It is incremented for every
    # non-homograph morpheme created. NOTE(review): presumably reset per sentence by
    # the caller; that is not visible in this class.
    count = 0

    def __init__(
        self,
        text: str,
        reading: str,
        lemma: str,
        pos: str,
        pos_id: int,
        subpos: str,
        subpos_id: int,
        conjtype: str,
        conjtype_id: int,
        conjform: str,
        conjform_id: int,
        semantics: SemanticsDict | None = None,
        features: FeatureDict | None = None,
        homograph: bool = False,
    ) -> None:
        """Create a morpheme.

        Args:
            text: Surface form.
            reading: Reading.
            lemma: Lemma (base form).
            pos: Part of speech.
            pos_id: Part-of-speech ID.
            subpos: Part-of-speech subcategory.
            subpos_id: Part-of-speech subcategory ID.
            conjtype: Conjugation type.
            conjtype_id: Conjugation type ID.
            conjform: Conjugation form.
            conjform_id: Conjugation form ID.
            semantics: Semantic information from the dictionary; an empty
                ``SemanticsDict`` is used when omitted.
            features: Features; an empty ``FeatureDict`` is used when omitted.
            homograph: Whether this morpheme is a homograph of another one.
                A homograph takes the current ``count`` as its index but does
                not advance the counter.
        """
        super().__init__()
        self.text = text
        self.reading = reading  #: Reading.
        self.lemma = lemma  #: Lemma (base form).
        self.pos = pos  #: Part of speech.
        self.pos_id = pos_id  #: Part-of-speech ID.
        self.subpos = subpos  #: Part-of-speech subcategory.
        self.subpos_id = subpos_id  #: Part-of-speech subcategory ID.
        self.conjtype = conjtype  #: Conjugation type.
        self.conjtype_id = conjtype_id  #: Conjugation type ID.
        self.conjform = conjform  #: Conjugation form.
        self.conjform_id = conjform_id  #: Conjugation form ID.

        # parent unit
        self._base_phrase: Optional["BasePhrase"] = None
        self._sentence: Optional["Sentence"] = None

        self.semantics: SemanticsDict = (
            semantics if semantics is not None else SemanticsDict()
        )  #: Semantic information listed in the dictionary.
        self.features: FeatureDict = features if features is not None else FeatureDict()  #: Features.
        self.homographs: list["Morpheme"] = []  #: Homographs of this morpheme.

        self.index = self.count  #: Index within the sentence.
        if homograph is False:
            Morpheme.count += 1

    @override
    def __hash__(self) -> int:
        """Hash on the parent unit and the index, matching :meth:`__eq__`."""
        return hash((self.parent_unit, self.index))

    @override
    def __eq__(self, other: object) -> bool:
        """Return True if ``other`` has the same type, parent unit and index."""
        if not isinstance(other, type(self)):
            return False
        if self.parent_unit != other.parent_unit:
            return False
        return self.index == other.index

    @cached_property
    def global_index(self) -> int:
        """Index of this morpheme within the whole document.

        Equals the sentence-local index when the sentence has no document or
        is the first sentence of the document.
        """
        if not self.sentence.has_document():
            return self.index
        if self.sentence.index == 0:
            return self.index
        if self.index > 0:
            # Offset from the first morpheme of the same sentence.
            return self.sentence.morphemes[0].global_index + self.index
        # First morpheme of a later sentence: continue right after the previous sentence.
        prev_sentence = self.document.sentences[self.sentence.index - 1]
        return prev_sentence.morphemes[0].global_index + len(prev_sentence.morphemes)

    @property
    def parent_unit(self) -> Optional[Union["BasePhrase", "Sentence"]]:
        """The parent unit (a base phrase or a sentence), or None if none is registered.

        .. note::
            Returns the base phrase when analyzed by KNP and the sentence when
            analyzed by Jumanpp.
        """
        if self._base_phrase is not None:
            return self._base_phrase
        if self._sentence is not None:
            return self._sentence
        return None

    @property
    def child_units(self) -> None:
        """Child units. Always None because a morpheme is the lowest-level unit."""
        return

    @property
    def document(self) -> "Document":
        """The document this morpheme belongs to.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.sentence.document

    @property
    def sentence(self) -> "Sentence":
        """The sentence this morpheme belongs to.

        Uses the directly registered sentence when there is one, otherwise the
        sentence of the base phrase.
        """
        return self._sentence or self.base_phrase.sentence

    @sentence.setter
    def sentence(self, sentence: "Sentence") -> None:
        """Register the sentence.

        Args:
            sentence: The sentence.
        """
        self._sentence = sentence

    @property
    def clause(self) -> "Clause":
        """The clause this morpheme belongs to.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.base_phrase.clause

    @property
    def phrase(self) -> "Phrase":
        """The phrase this morpheme belongs to.

        Raises:
            AttributeError: If the analysis result is not accessible.
        """
        return self.base_phrase.phrase

    @property
    def base_phrase(self) -> "BasePhrase":
        """The base phrase this morpheme belongs to.

        Raises:
            AttributeError: If no base phrase has been set.
        """
        if self._base_phrase is None:
            raise AttributeError("base_phrase has not been set")
        return self._base_phrase

    @base_phrase.setter
    def base_phrase(self, base_phrase: "BasePhrase") -> None:
        """Register the base phrase.

        Args:
            base_phrase: The base phrase.
        """
        self._base_phrase = base_phrase

    @property
    def surf(self) -> str:
        """Surface form (an alias of ``text``)."""
        return self.text

    @property
    def canon(self) -> str | None:
        """Canonical representation (the "代表表記" entry of the semantics), or None."""
        canon = self.semantics.get("代表表記")
        assert canon is None or isinstance(canon, str)
        return canon

    @property
    def sstring(self) -> str:
        """Semantic information in Juman++ format."""
        return self.semantics.to_sstring()

    @property
    def fstring(self) -> str:
        """Features in Juman++ format."""
        return self.features.to_fstring()

    @cached_property
    def parent(self) -> Optional["Morpheme"]:
        """The morpheme this one depends on, or None if there is none.

        The head of a base phrase depends on the head of the parent base
        phrase; every other morpheme depends on the head of its own base phrase.
        """
        if self.base_phrase.head == self:
            if self.base_phrase.parent is not None:
                return self.base_phrase.parent.head
            return None
        return self.base_phrase.head

    @cached_property
    def span(self) -> tuple[int, int]:
        """Character-level ``(start, end)`` span within the sentence."""
        if self.index == 0:
            start = 0
        else:
            # Start where the preceding morpheme ends.
            _, start = self.sentence.morphemes[self.index - 1].span
        end = start + len(self.text)  # TODO: correctly handle multibyte characters
        return start, end

    @cached_property
    def global_span(self) -> tuple[int, int]:
        """Character-level ``(start, end)`` span within the whole document."""
        # Shift the sentence-local span by the text length of all preceding sentences.
        offset = 0
        for prev_sentence in self.document.sentences[: self.sentence.index]:
            offset += len(prev_sentence.text)
        start, end = self.span
        return start + offset, end + offset

    @cached_property
    def children(self) -> list["Morpheme"]:
        """The morphemes of the sentence whose ``parent`` is this morpheme."""
        return [morpheme for morpheme in self.sentence.morphemes if morpheme.parent == self]

    @classmethod
    def from_jumanpp(cls, jumanpp_text: str) -> "Morpheme":
        """Create a morpheme from Juman++ output.

        The first line describes the morpheme itself; every following line
        must be a homograph line (``@ `` followed by a morpheme line) and is
        attached to ``homographs``.

        Args:
            jumanpp_text: Juman++ output for one morpheme.

        Raises:
            ValueError: If a line cannot be parsed.
        """
        first_line, *homograph_lines = jumanpp_text.rstrip().split("\n")
        morpheme = cls._from_jumanpp_line(first_line)
        for line in homograph_lines:
            # Raise instead of assert: this validates input and must survive ``python -O``.
            if not cls.is_homograph_line(line):
                raise ValueError(f"malformed homograph line: {line}")
            # Drop the leading "@ " marker before parsing.
            homograph = cls._from_jumanpp_line(line[2:], homograph=True)
            morpheme.homographs.append(homograph)
        return morpheme

    @classmethod
    def _from_jumanpp_line(cls, jumanpp_line: str, homograph: bool = False) -> "Morpheme":
        """Create a morpheme from a single line of Juman++ output.

        Args:
            jumanpp_line: One morpheme line of Juman++ output.
            homograph: Whether the line describes a homograph.

        Raises:
            ValueError: If the line cannot be parsed.
        """
        match = cls.PAT.match(jumanpp_line) or cls.PAT_REPEATED.match(jumanpp_line)
        if match is None:
            raise ValueError(f"malformed morpheme line: {jumanpp_line}")
        match_attr = cls._ATTRIBUTE_PAT.match(match["attrs"]) or cls._ATTRIBUTE_PAT_REPEATED.match(match["attrs"])
        if match_attr is None:
            raise ValueError(f"malformed morpheme line: {jumanpp_line}")
        # attributes: reading, lemma, pos, pos_id, subpos, subpos_id,
        #             conjtype, conjtype_id, conjform, conjform_id
        attributes = match_attr.groups()
        surf, reading, lemma = match["surf"], attributes[0], attributes[1]
        semantics = SemanticsDict.from_sstring(match["sems"] or "")
        # Resume text if it is escaped (Juman++ 2.0.0-rc3)
        if semantics.get("元半角") is True:
            surf, reading, lemma = (  # pragma: no cover
                cls._UNESCAPE_MAP_HALF_TO_FULL_WIDTH.get(s, s) for s in (surf, reading, lemma)
            )
        # Turn backslash-escaped tab/space fields back into the raw characters.
        surf, reading, lemma = (cls._UNESCAPE_MAP_CONTROL_CHAR.get(s, s) for s in (surf, reading, lemma))
        return cls(
            surf,
            reading,
            lemma,
            attributes[2],
            int(attributes[3]),
            attributes[4],
            int(attributes[5]),
            attributes[6],
            int(attributes[7]),
            attributes[8],
            int(attributes[9]),
            semantics=semantics,
            features=FeatureDict.from_fstring(match["feats"] or ""),
            homograph=homograph,
        )

    def to_jumanpp(self) -> str:
        """Convert to Juman++ format.

        Returns the morpheme line followed by one ``@ ``-prefixed line per
        homograph; every line ends with a newline.
        """
        line = self._to_jumanpp_line()
        if self.features:
            line += f" {self.features.to_fstring()}"
        lines = [line + "\n"]
        lines.extend(f"@ {homograph.to_jumanpp()}" for homograph in self.homographs)
        return "".join(lines)

    def to_knp(self) -> str:
        """Convert to KNP format.

        Homographs are not written as separate lines; each one is encoded as
        an ``ALT-...`` feature on the morpheme line.
        """
        ret = self._to_jumanpp_line()
        # Work on a copy so that the ALT features are not added to self.features.
        features = FeatureDict(self.features)
        for homograph in self.homographs:
            alt_feature_key = "ALT-{}-{}-{}-{}-{}-{}-{}-{}".format(  # noqa: UP032
                homograph.surf,
                homograph.reading,
                homograph.lemma,
                homograph.pos_id,
                homograph.subpos_id,
                homograph.conjtype_id,
                homograph.conjform_id,
                homograph.semantics.to_sstring(),
            )
            features[alt_feature_key] = True
        if features:
            ret += f" {features.to_fstring()}"
        ret += "\n"
        return ret

    def _to_jumanpp_line(self) -> str:
        """Build the Juman++ line (attributes and semantics, without features or newline)."""
        attrs: list[str] = []
        for attr_name in self._ATTRIBUTES:
            attr = getattr(self, attr_name)
            if attr_name in ("surf", "reading", "lemma"):
                # A field that is exactly a tab or a space is written in its escaped form.
                attr = self._ESCAPE_MAP_CONTROL_CHAR.get(attr, attr)
            attrs.append(str(attr))
        ret = " ".join(attrs)
        # Semantics are also written when the dict is empty but flagged as NIL.
        if self.semantics or self.semantics.is_nil():
            ret += f" {self.semantics.to_sstring()}"
        return ret

    @staticmethod
    def is_morpheme_line(line: str) -> bool:
        """Return True if the line is a morpheme line."""
        return Morpheme.PAT.match(line) is not None or Morpheme.PAT_REPEATED.match(line) is not None

    @staticmethod
    def is_homograph_line(line: str) -> bool:
        """Return True if the line is a homograph line."""
        return line.startswith("@") and Morpheme.is_morpheme_line(line[2:])