The line wrapping algorithm of gettext is somewhat complicated. Rather than trying to approximate it, use `msgcat` from gettext directly. This is also what poedit does.
27 lines
629 B
Python
Executable file
27 lines
629 B
Python
Executable file
#!/usr/bin/env python
|
|
from typing import List, Tuple
|
|
|
|
import sudachipy
|
|
|
|
import segmenter_lib
|
|
|
|
|
|
class JaTokenizer:
    """Callable that segments UTF-8 encoded text with SudachiPy.

    Calling an instance tokenizes the decoded text and reports every token
    boundary as a byte offset into the encoded input, not as a character
    index into the decoded string.
    """

    # Split mode handed to the SudachiPy tokenizer when it is created.
    _MODE = sudachipy.SplitMode.C

    def __init__(self) -> None:
        """Create the tokenizer from a default-constructed Dictionary."""
        self._tokenizer = sudachipy.Dictionary().create(mode=self._MODE)

    def __call__(self, text: bytes) -> Tuple[List[int], List[int]]:
        """Tokenize ``text`` and return the tokens' byte offsets.

        Args:
            text: UTF-8 encoded input.

        Returns:
            A ``(starts, ends)`` pair of parallel lists with one entry per
            token, in the order the tokenizer yields them. Each entry is
            the UTF-8 byte length of the text preceding the token's
            ``begin()`` / ``end()`` character index.

        Raises:
            UnicodeDecodeError: If ``text`` is not valid UTF-8.
        """
        unicode_text = text.decode("utf-8")
        tokens = self._tokenizer.tokenize(unicode_text)

        # byte_offsets[i] == len(unicode_text[:i].encode("utf-8")).
        # Building the table once keeps the conversion linear in the text
        # length instead of re-encoding a growing prefix for every token.
        byte_offsets = [0]
        for char in unicode_text:
            byte_offsets.append(byte_offsets[-1] + len(char.encode("utf-8")))

        starts: List[int] = []
        ends: List[int] = []
        for token in tokens:
            starts.append(byte_offsets[token.begin()])
            ends.append(byte_offsets[token.end()])
        return starts, ends
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the tokenizer and hand it to the
    # segmenter_lib driver.
    tokenizer = JaTokenizer()
    segmenter_lib.Main(tokenizer)
|