devilutionX/tools/segmenter/segment_ja.py
Gleb Mazovetskiy 395dbb18f0 Segmenter: Use gettext to line-wrap po files
The line wrapping algorithm of gettext is somewhat complicated.

Rather than trying to approximate it, use `msgcat` from gettext
directly. This is also what poedit does.
2021-11-19 11:39:04 +00:00

27 lines
629 B
Python
Executable file

#!/usr/bin/env python
from typing import List, Tuple
import sudachipy
import segmenter_lib
class JaTokenizer:
    """Callable that segments UTF-8 encoded text with a Sudachi tokenizer.

    Calling an instance returns, for every token produced by the tokenizer,
    the byte offsets into the UTF-8 input at which that token starts and ends.
    """

    # Split mode the Sudachi tokenizer is created with.
    _MODE = sudachipy.SplitMode.C

    def __init__(self) -> None:
        """Create the Sudachi tokenizer, configured with `_MODE`."""
        # Pass the mode explicitly so the class constant is what actually
        # governs segmentation, rather than the library default.
        self._tokenizer = sudachipy.Dictionary().create(mode=self._MODE)

    def __call__(self, text: bytes) -> Tuple[List[int], List[int]]:
        """Tokenize `text` and return the token boundaries as byte offsets.

        Args:
            text: The input, encoded as UTF-8.

        Returns:
            A `(starts, ends)` pair of equally long lists. `starts[i]` and
            `ends[i]` are the offsets, in bytes of the UTF-8 encoding, of the
            beginning and end of the i-th token.

        Raises:
            UnicodeDecodeError: If `text` is not valid UTF-8.
        """
        unicode_text = text.decode()
        tokens = self._tokenizer.tokenize(unicode_text)

        # The token boundaries are used as indices into the decoded string,
        # while callers get byte offsets. Build the index -> byte offset table
        # once instead of re-encoding a prefix of the text for every token.
        byte_offsets = self._utf8_prefix_lengths(unicode_text)

        starts = []
        ends = []
        for token in tokens:
            starts.append(byte_offsets[token.begin()])
            ends.append(byte_offsets[token.end()])
        return starts, ends

    @staticmethod
    def _utf8_prefix_lengths(unicode_text: str) -> List[int]:
        """Return a table where entry `i` is `len(unicode_text[:i].encode())`.

        The table has `len(unicode_text) + 1` entries, so both a token's first
        index and its one-past-the-end index can be looked up.
        """
        prefix_lengths = [0]
        running_total = 0
        for char in unicode_text:
            running_total += len(char.encode())
            prefix_lengths.append(running_total)
        return prefix_lengths
if __name__ == "__main__":
    # Script entry point: hand a Japanese tokenizer to the shared segmenter
    # driver from segmenter_lib.
    ja_tokenizer = JaTokenizer()
    segmenter_lib.Main(ja_tokenizer)