Source code for rhoknp.props.feature
import logging
import re
from collections.abc import Iterable, Mapping
from typing import ClassVar
logger = logging.getLogger(__name__)
[docs]
class FeatureDict(dict[str, str | bool]):
"""文節,基本句,形態素の素性情報を表すクラス."""
IGNORE_TAG_PREFIXES: ClassVar[set[str]] = {"rel ", "memo "}
_FEATURE_KEY_PAT: ClassVar[re.Pattern] = re.compile(r"(?P<key>([^:\"]|\"[^\"]*?\")+?)")
_FEATURE_VALUE_PAT: ClassVar[re.Pattern] = re.compile(r"(?P<value>([^>\\]|\\>?)+)")
PAT: ClassVar[re.Pattern] = re.compile(
rf"(?P<feats>(<{_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>)*)"
)
FEATURE_PAT: ClassVar[re.Pattern] = re.compile(
rf"<(?!({'|'.join(IGNORE_TAG_PREFIXES)})){_FEATURE_KEY_PAT.pattern}(:{_FEATURE_VALUE_PAT.pattern})?>"
)
def __setitem__(self, key: str, value: str | bool) -> None:
if key == "rel":
logger.warning(
f"Adding 'rel' to {self.__class__.__name__} is not supported and was ignored. Instead, add a RelTag "
f"object to BasePhrase.rel_tags and call Document.reparse()."
)
return
if key == "memo":
logger.warning(
f"Adding 'memo' to {self.__class__.__name__} is not supported and was ignored. Instead, set a MemoTag "
f"object to BasePhrase.memo_tag."
)
return
super().__setitem__(key, value)
[docs]
@classmethod
def from_fstring(cls, fstring: str) -> "FeatureDict":
"""素性文字列をパースして辞書型に変換する.
例:"<正規化代表表記:遅れる/おくれる>" -> {"正規化代表表記": "遅れる/おくれる"}
Args:
fstring: KNP 形式における素性文字列.
"""
features = cls()
for match in cls.FEATURE_PAT.finditer(fstring):
features[match["key"]] = match["value"].replace(r"\>", ">") if match["value"] is not None else True
return features
[docs]
def to_fstring(self) -> str:
"""素性文字列に変換."""
return "".join(self._item_to_fstring(k, v) for k, v in self.items())
@staticmethod
def _item_to_fstring(key: str, value: str | bool) -> str:
if value is False:
return ""
if value is True:
return f"<{key}>"
escaped_value = value.replace(">", r"\>") # escape ">"
return f"<{key}:{escaped_value}>"