Source code for rhoknp.processors.knp
import logging
import select
import subprocess
import threading
import time
from subprocess import PIPE, Popen
from threading import Lock
try:
from typing import override # type: ignore[attr-defined]
except ImportError:
from typing_extensions import override
from rhoknp.processors.jumanpp import Jumanpp
from rhoknp.processors.processor import Processor
from rhoknp.processors.senter import RegexSenter
from rhoknp.units import Document, Sentence
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
[docs]
class KNP(Processor):
    """KNP class.

    Args:
        executable: Path to the KNP executable.
        options: Options passed to KNP.
        senter: Sentence splitter instance. If the input has not been split into sentences yet,
            it is split with this instance first. If unset, RegexSenter is used.
        jumanpp: Jumanpp instance. If the input has not been morphologically analyzed yet,
            it is analyzed with this instance first. If unset, Jumanpp (with no options) is used.
        skip_sanity_check: If True, skip the sanity check when KNP is started.

    Example:
        >>> from rhoknp import KNP
        >>> knp = KNP()
        >>> document = knp.apply("電気抵抗率は電気の通しにくさを表す物性値である。")

    .. note::
        `KNP <https://github.com/ku-nlp/knp>`_ must be installed to use this class.
    """
def __init__(
self,
executable: str = "knp",
options: list[str] | None = None,
senter: Processor | None = None,
jumanpp: Processor | None = None,
skip_sanity_check: bool = False,
) -> None:
self.executable = executable #: KNP のパス.
self.options = options or ["-tab"] #: KNP のオプション.
self.senter = senter
self.jumanpp = jumanpp
self._lock = Lock()
self._proc: Popen | None = None
if "-tab" not in self.options:
raise ValueError("`-tab` option is required when you use KNP.")
self.start_process(skip_sanity_check)
def __repr__(self) -> str:
arg_string = f"executable={self.executable!r}"
if self.options:
arg_string += f", options={self.options!r}"
if self.senter is not None:
arg_string += f", senter={self.senter!r}"
if self.jumanpp is not None:
arg_string += f", jumanpp={self.jumanpp!r}"
return f"{self.__class__.__name__}({arg_string})"
def __del__(self) -> None:
try:
if self._proc is not None:
self._proc.terminate()
except AttributeError: # pragma: no cover
# for free-threaded Python interpreters
pass # pragma: no cover
[docs]
def start_process(self, skip_sanity_check: bool = False) -> None:
"""KNP を起動する.
.. note::
KNP がすでに起動している場合は再起動する.
skip_sanity_check: True なら,KNP の起動時に sanity check をスキップする.
"""
if self._proc is not None:
self._proc.terminate()
try:
self._proc = Popen(self.run_command, stdin=PIPE, stdout=PIPE, stderr=PIPE, encoding="utf-8")
if skip_sanity_check is False:
_ = self.apply(Sentence.from_jumanpp("EOS"))
except Exception as e:
logger.warning(f"failed to start KNP: {e}")
[docs]
def is_available(self) -> bool:
"""KNP が利用可能であれば True を返す."""
return self._proc is not None and self._proc.poll() is None
[docs]
@override
def apply_to_document(self, document: Document | str, timeout: int = 10) -> Document:
"""文書に KNP を適用する.
Args:
document: 文書.
timeout: 最大処理時間.
.. note::
文分割がまだなら,先に初期化時に設定した senter で文分割する.
未設定なら RegexSenter で文分割する.
形態素解析がまだなら,先に初期化時に設定した jumanpp で形態素解析する.
未設定なら Jumanpp (オプションなし)で形態素解析する.
"""
if not self.is_available():
raise RuntimeError("KNP is not available.")
start: float = time.time()
if isinstance(document, str):
document = Document(document)
doc_id = document.doc_id
if document.is_senter_required():
if self.senter is None:
logger.debug("senter is not specified; use RegexSenter")
self.senter = RegexSenter()
document = self.senter.apply_to_document(document, timeout=timeout - int(time.time() - start))
sentences: list[Sentence] = []
for sentence in document.sentences:
sentences.append(self.apply_to_sentence(sentence, timeout=timeout - int(time.time() - start)))
ret = Document.from_sentences(sentences)
if doc_id != "":
ret.doc_id = doc_id
for sentence in ret.sentences:
sentence.doc_id = doc_id
return ret
[docs]
@override
def apply_to_sentence(self, sentence: Sentence | str, timeout: int = 10) -> Sentence:
"""文に KNP を適用する.
Args:
sentence: 文.
timeout: 最大処理時間.
.. note::
形態素解析がまだなら,先に初期化時に設定した jumanpp で形態素解析する.
未設定なら Jumanpp (オプションなし)で形態素解析する.
"""
if self.is_available() is False:
raise RuntimeError("KNP is not available.")
start: float = time.time()
if isinstance(sentence, str):
sentence = Sentence(sentence)
if sentence.is_jumanpp_required():
with self._lock:
if self.jumanpp is None:
logger.debug("jumanpp is not specified when initializing KNP: use Jumanpp with no option")
self.jumanpp = Jumanpp()
sentence = self.jumanpp.apply_to_sentence(sentence, timeout=timeout - int(time.time() - start))
stdout_text: str = ""
def worker() -> None:
nonlocal stdout_text
assert self._proc is not None
assert self._proc.stdin is not None
assert self._proc.stdout is not None
assert self._proc.stderr is not None
if sentence.is_knp_required():
self._proc.stdin.write(sentence.to_jumanpp())
else:
self._proc.stdin.write(sentence.to_knp())
self._proc.stdin.flush()
stdout_text = ""
while self.is_available():
line = self._proc.stdout.readline()
stdout_text += line
if line.strip() == Sentence.EOS:
break
# Non-blocking read from stderr
stderr_text = ""
while self._proc.stderr in select.select([self._proc.stderr], [], [], 0)[0]:
line = self._proc.stderr.readline()
if line.strip() == "":
break
stderr_text += line
if stderr_text.strip() != "":
logger.debug(stderr_text.strip())
with self._lock:
thread = threading.Thread(target=worker, daemon=True)
thread.start()
thread.join(timeout)
if thread.is_alive():
self.start_process(skip_sanity_check=True)
raise TimeoutError(f"Operation timed out after {timeout} seconds.")
if not self.is_available():
self.start_process(skip_sanity_check=True)
raise RuntimeError("KNP exited unexpectedly.")
ret = Sentence.from_knp(stdout_text)
if sentence.text and not ret.text:
raise RuntimeError(f"KNP returned empty result for input: '{sentence.text}'")
return ret
[docs]
def get_version(self) -> str:
"""Juman++ のバージョンを返す."""
if not self.is_available():
raise RuntimeError("KNP is not available.")
p = subprocess.run(self.version_command, capture_output=True, encoding="utf-8", check=True)
return p.stderr.strip()
@property
def run_command(self) -> list[str]:
"""解析時に実行するコマンド."""
return [self.executable, *self.options]
@property
def version_command(self) -> list[str]:
"""バージョンを確認するコマンド."""
return [self.executable, "-v"]