hojichar.filters.language_identification

  1import hashlib
  2import logging
  3import os
  4import time
  5from os import PathLike
  6from pathlib import Path
  7from typing import Any, Tuple, Union
  8
  9try:
 10    import requests
 11    from fasttext import load_model  # type: ignore
 12
 13    is_loaded_extras = True
 14except ImportError:
 15    is_loaded_extras = False
 16
 17from tqdm import tqdm
 18
 19from hojichar import Document, Filter
 20
 21logger = logging.getLogger(__name__)
 22
 23
 24FASTTEXT_MODEL_URL = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
 25MODEL_CHECKSUM = "01810bc59c6a3d2b79c79e6336612f65"
 26
 27
 28def _download_with_progress_bar(
 29    download_url: str, save_path: Union[str, PathLike], retries: int = 3, delay: float = 1.0
 30) -> None:
 31    # HACK type hint `os.PathLike[str]` is not allowed in Python 3.8 or older.
 32    # So I write Union[str, PathLike]. In the future, I will use `os.PathLike[str]` or simply Path.
 33    try:
 34        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
 35        with requests.get(download_url, stream=True) as r:
 36            r.raise_for_status()
 37            total_size = int(r.headers.get("content-length", 0))
 38            with tqdm(total=total_size, unit="B", unit_scale=True) as pbar:
 39                with open(save_path, "wb") as f:
 40                    for chunk in r.iter_content(chunk_size=8192):
 41                        f.write(chunk)
 42                        pbar.update(len(chunk))
 43    except requests.RequestException as e:
 44        if retries > 0:
 45            logger.warning(
 46                f"Download failed, retrying in {delay} seconds... ({retries} retries left)"
 47            )
 48            time.sleep(delay)
 49            _download_with_progress_bar(download_url, save_path, retries - 1, delay)
 50        else:
 51            logger.error(f"Download failed after retries: {e}")
 52            raise
 53
 54
 55def _get_md5_hash_of_file(file_path: Union[str, PathLike]) -> str:
 56    """
 57    Function to calculate the MD5 hash of a file.
 58
 59    Read the file in chunks to avoid loading large files into memory.
 60    cf. https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
 61    """
 62    md5_hash = hashlib.md5()
 63    with open(file_path, "rb") as file:
 64        while chunk := file.read(8192):
 65            md5_hash.update(chunk)
 66    return md5_hash.hexdigest()
 67
 68
 69def _download_fasttext_model(model_path: Union[str, PathLike]) -> None:
 70    logger.info(f"Downloading fasttext model from {FASTTEXT_MODEL_URL} to {model_path}...")
 71    _download_with_progress_bar(FASTTEXT_MODEL_URL, model_path)
 72
 73
 74class LanguageIdentificationByFastText(Filter):
 75    """
 76    A filter that removes non-Japanese text
 77    using Language IDentification (LID).
 78    Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html
 79    The available languages are:
 80        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb
 81        ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom
 82        gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv
 83        kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
 84        nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah
 85        sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
 86        vep vi vls vo wa war wuu xal xmf yi yo yue zh
 87    """
 88
 89    def __init__(
 90        self,
 91        language: str,
 92        lang_score_threshold: float = 0.50,
 93        model_path: Union[str, PathLike, None] = None,
 94        *args: Any,
 95        **kwargs: Any,
 96    ) -> None:
 97        """
 98        Args:
 99            language: the language to be accepted.
100            lang_score_threshold: the document whose fasttext score is
101              below this threshold will be discarded.
102              A default value is 0.50, which is empirically very generous value,
103              i.e., almost all documents are accepted if the document
104              is classified as japanese text.
105            model_path: The directory path, which the model saved in.
106                If None, the model will be saved in the current directory.
107            *args:
108            **kwargs:
109        """
110        super().__init__(*args, **kwargs)
111        if not is_loaded_extras:
112            raise ImportError(
113                "The `fasttext` package is required to use this filter. "
114                "Please install it by running `pip install hojichar[all]`"
115                "or `pip install fasttext requests`."
116            )
117
118        self.lang_score_threshold = lang_score_threshold
119        self.language = language
120
121        self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin"
122
123        if not self.model_path.exists():
124            logger.info("Fasttext model file was not found.")
125            _download_fasttext_model(self.model_path)
126
127        assert _get_md5_hash_of_file(self.model_path) == MODEL_CHECKSUM, (
128            f"Checksum of the downloaded model file does not match the expected value. "
129            f"Expected: {MODEL_CHECKSUM}, "
130            f"Actual: {_get_md5_hash_of_file(self.model_path)}"
131        )
132        self.model = load_model(str(self.model_path))
133
134    def _predict_language(self, text: str) -> Tuple[str, float]:
135        # fasttext cannot handle multiline input
136        # so we must remove the newline character
137        text = text.strip().replace("\n", " ")
138        pred = self.model.predict(text)
139        pred_lang = pred[0][0].replace("__label__", "")
140        pred_score = pred[1][0]
141        return pred_lang, pred_score
142
143    def apply(self, doc: Document) -> Document:
144        pred_lang, score = self._predict_language(doc.text)
145        if not (pred_lang == self.language and score >= self.lang_score_threshold):
146            doc.is_rejected = True
147        return doc
148
149
class AcceptJapaneseByFastText(LanguageIdentificationByFastText):
    """
    A filter that keeps Japanese text only, rejecting everything else
    via Language Identification (LID) by FastText.

    >>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
    True
    >>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
    False
    >>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
    True
    """

    def __init__(
        self,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Configure the parent filter with the accepted language fixed to "ja".
        All other arguments are passed through unchanged.
        """
        accepted_language = "ja"
        super().__init__(accepted_language, lang_score_threshold, model_path, *args, **kwargs)
class LanguageIdentificationByFastText(hojichar.core.filter_interface.Filter):
 75class LanguageIdentificationByFastText(Filter):
 76    """
 77    A filter that removes non-Japanese text
 78    using Language IDentification (LID).
 79    Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html
 80    The available languages are:
 81        af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb
 82        ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom
 83        gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv
 84        kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn
 85        nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah
 86        sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec
 87        vep vi vls vo wa war wuu xal xmf yi yo yue zh
 88    """
 89
 90    def __init__(
 91        self,
 92        language: str,
 93        lang_score_threshold: float = 0.50,
 94        model_path: Union[str, PathLike, None] = None,
 95        *args: Any,
 96        **kwargs: Any,
 97    ) -> None:
 98        """
 99        Args:
100            language: the language to be accepted.
101            lang_score_threshold: the document whose fasttext score is
102              below this threshold will be discarded.
103              A default value is 0.50, which is empirically very generous value,
104              i.e., almost all documents are accepted if the document
105              is classified as japanese text.
106            model_path: The directory path, which the model saved in.
107                If None, the model will be saved in the current directory.
108            *args:
109            **kwargs:
110        """
111        super().__init__(*args, **kwargs)
112        if not is_loaded_extras:
113            raise ImportError(
114                "The `fasttext` package is required to use this filter. "
115                "Please install it by running `pip install hojichar[all]`"
116                "or `pip install fasttext requests`."
117            )
118
119        self.lang_score_threshold = lang_score_threshold
120        self.language = language
121
122        self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin"
123
124        if not self.model_path.exists():
125            logger.info("Fasttext model file was not found.")
126            _download_fasttext_model(self.model_path)
127
128        assert _get_md5_hash_of_file(self.model_path) == MODEL_CHECKSUM, (
129            f"Checksum of the downloaded model file does not match the expected value. "
130            f"Expected: {MODEL_CHECKSUM}, "
131            f"Actual: {_get_md5_hash_of_file(self.model_path)}"
132        )
133        self.model = load_model(str(self.model_path))
134
135    def _predict_language(self, text: str) -> Tuple[str, float]:
136        # fasttext cannot handle multiline input
137        # so we must remove the newline character
138        text = text.strip().replace("\n", " ")
139        pred = self.model.predict(text)
140        pred_lang = pred[0][0].replace("__label__", "")
141        pred_score = pred[1][0]
142        return pred_lang, pred_score
143
144    def apply(self, doc: Document) -> Document:
145        pred_lang, score = self._predict_language(doc.text)
146        if not (pred_lang == self.language and score >= self.lang_score_threshold):
147            doc.is_rejected = True
148        return doc

A filter that rejects text not identified as the configured language, using Language IDentification (LID). Download the fastText model for LID: https://fasttext.cc/docs/en/language-identification.html The available languages are: af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh

LanguageIdentificationByFastText( language: str, lang_score_threshold: float = 0.5, model_path: Union[str, os.PathLike, NoneType] = None, *args: Any, **kwargs: Any)
 90    def __init__(
 91        self,
 92        language: str,
 93        lang_score_threshold: float = 0.50,
 94        model_path: Union[str, PathLike, None] = None,
 95        *args: Any,
 96        **kwargs: Any,
 97    ) -> None:
 98        """
 99        Args:
100            language: the language to be accepted.
101            lang_score_threshold: the document whose fasttext score is
102              below this threshold will be discarded.
103              A default value is 0.50, which is empirically very generous value,
104              i.e., almost all documents are accepted if the document
105              is classified as japanese text.
106            model_path: The directory path, which the model saved in.
107                If None, the model will be saved in the current directory.
108            *args:
109            **kwargs:
110        """
111        super().__init__(*args, **kwargs)
112        if not is_loaded_extras:
113            raise ImportError(
114                "The `fasttext` package is required to use this filter. "
115                "Please install it by running `pip install hojichar[all]`"
116                "or `pip install fasttext requests`."
117            )
118
119        self.lang_score_threshold = lang_score_threshold
120        self.language = language
121
122        self.model_path = Path(model_path) if model_path else Path(os.getcwd()) / "lid.176.bin"
123
124        if not self.model_path.exists():
125            logger.info("Fasttext model file was not found.")
126            _download_fasttext_model(self.model_path)
127
128        assert _get_md5_hash_of_file(self.model_path) == MODEL_CHECKSUM, (
129            f"Checksum of the downloaded model file does not match the expected value. "
130            f"Expected: {MODEL_CHECKSUM}, "
131            f"Actual: {_get_md5_hash_of_file(self.model_path)}"
132        )
133        self.model = load_model(str(self.model_path))

Args: language: the language to be accepted. lang_score_threshold: documents whose fasttext score is below this threshold will be discarded. The default value is 0.50, which is empirically a very generous value, i.e., almost all documents are accepted if the document is classified as the target language. model_path: The path of the model file. If None, the model is saved as `lid.176.bin` in the current directory. *args: **kwargs:

def apply( self, doc: hojichar.core.models.Document) -> hojichar.core.models.Document:
144    def apply(self, doc: Document) -> Document:
145        pred_lang, score = self._predict_language(doc.text)
146        if not (pred_lang == self.language and score >= self.lang_score_threshold):
147            doc.is_rejected = True
148        return doc

Definition of filter behavior.

The document must implement the TextContent protocol; in most cases it is an instance of the hojichar.Document class.

In this method, the filter will modify document.text or document.extras and set document.is_rejected = True to discard the document.

Parameters

document : Document Input document

Returns

Document Processed Document

class AcceptJapaneseByFastText(LanguageIdentificationByFastText):
class AcceptJapaneseByFastText(LanguageIdentificationByFastText):
    """
    A filter that keeps Japanese text only, rejecting everything else
    via Language Identification (LID) by FastText.

    >>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
    True
    >>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
    False
    >>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
    True
    """

    def __init__(
        self,
        lang_score_threshold: float = 0.50,
        model_path: Union[str, PathLike, None] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Configure the parent filter with the accepted language fixed to "ja".
        All other arguments are passed through unchanged.
        """
        accepted_language = "ja"
        super().__init__(accepted_language, lang_score_threshold, model_path, *args, **kwargs)

A filter that removes non-Japanese text via Language Identification (LID) by FastText.

>>> AcceptJapaneseByFastText().apply(Document("This is English document")).is_rejected
True
>>> AcceptJapaneseByFastText().apply(Document("自然言語処理大好き!")).is_rejected
False
>>> AcceptJapaneseByFastText().apply(Document("快三手机投注平台代理")).is_rejected
True
AcceptJapaneseByFastText( lang_score_threshold: float = 0.5, model_path: Union[str, os.PathLike, NoneType] = None, *args: Any, **kwargs: Any)
163    def __init__(
164        self,
165        lang_score_threshold: float = 0.50,
166        model_path: Union[str, PathLike, None] = None,
167        *args: Any,
168        **kwargs: Any,
169    ) -> None:
170        super().__init__("ja", lang_score_threshold, model_path, *args, **kwargs)

Args: language: the language to be accepted. lang_score_threshold: documents whose fasttext score is below this threshold will be discarded. The default value is 0.50, which is empirically a very generous value, i.e., almost all documents are accepted if the document is classified as Japanese text. model_path: The path of the model file. If None, the model is saved as `lid.176.bin` in the current directory. *args: **kwargs: