hojichar.filters.token_filters

import re
from typing import Any

from hojichar.core.filter_interface import TokenFilter
from hojichar.core.models import Token


class TokenAddDebagTag(TokenFilter):
    """Appends a debug tag to the end of the token."""

    def apply(self, token: Token) -> Token:  # type: ignore
        """
        >>> TokenAddDebagTag()("hello")
        'hello<sep>'
        """
        token.text += "<sep>"
        return token


class SEOTokenRemover(TokenFilter):
    """
    This process was migrated from legacy code. What it is meant to do
    is unclear, mainly because the regex pattern is too complex.
    """

    def __init__(self, min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Split tokens on spaces, hyphens, middle dots (・), and commas.
        self.token_split_pat = re.compile(r"\ |-|・|,")
        self.min_average_seo_char_length = min_average_seo_char_length
        self.replace_pat = re.compile(
            r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"  # noqa
        )

    def apply(self, token: Token) -> Token:  # type: ignore
        seo_words = self.token_split_pat.split(token.text.strip())
        n_words = len(seo_words)
        if n_words == 0:
            return token
        # Average number of characters per split word across the token text.
        avg_char_len = len(token.text) / n_words

        # A short average word length suggests ordinary text; leave it untouched.
        if avg_char_len <= self.min_average_seo_char_length:
            return token

        # Remove the first occurrence of the SEO-spam pattern, if present.
        replace_patterns = self.replace_pat.search(token.text)
        if replace_patterns is not None:
            token.text = token.text.replace(replace_patterns.group(0), "", 1)

        return token

class TokenAddDebagTag(hojichar.core.filter_interface.TokenFilter):

Appends a debug tag to the end of the token.

def apply(self, token: hojichar.core.models.Token) -> hojichar.core.models.Token:
>>> TokenAddDebagTag()("hello")
'hello<sep>'
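
As the doctest shows, a TokenFilter instance can be called directly on a string and returns the processed text. A minimal usage sketch relying only on that call convention (the token texts here are illustrative):

from hojichar.filters.token_filters import TokenAddDebagTag

add_tag = TokenAddDebagTag()
tokens = ["hello", "world"]  # hypothetical token texts
tagged = [add_tag(t) for t in tokens]
print(tagged)  # ['hello<sep>', 'world<sep>']
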
class SEOTokenRemover(hojichar.core.filter_interface.TokenFilter):

This process was migrated from legacy code. What it is meant to do is unclear, mainly because the regex pattern is too complex.
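
For intuition, the second alternative of replace_pat matches star-delimited runs of … and ━ characters, which look like SEO-style decoration. A minimal, self-contained demo of that pattern (the sample string is illustrative, not taken from real data):

import re

replace_pat = re.compile(
    r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"
)
match = replace_pat.search("keyword keyword ★……━━★ keyword")
print(match.group(0) if match else None)  # prints: ★……━━★
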

SEOTokenRemover(min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any)

Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply
    the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be
    created. If None, and used in the Compose class, the random state is shared with the
    Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.
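
A minimal construction sketch combining the class's own threshold with the inherited keyword arguments documented above (the values are illustrative):

from hojichar.filters.token_filters import SEOTokenRemover

remover = SEOTokenRemover(min_average_seo_char_length=5, p=1.0, skip_rejected=True)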

def apply(self, token: hojichar.core.models.Token) -> hojichar.core.models.Token:

Definition of filter behavior.

The input must implement the TextContent protocol; in practice this is usually the hojichar.Document class. Note that for a TokenFilter such as this one, apply receives a Token and returns the processed Token, as the signature above shows.

In this method, the filter may modify document.text or document.extras, and it can set document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed Document
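
A sketch of the gating heuristic in SEOTokenRemover.apply: the spam pattern is only removed when the average word length, measured over splits on spaces, hyphens, middle dots, and commas, exceeds min_average_seo_char_length. Reproducing the arithmetic standalone (the sample text is illustrative):

import re

token_split_pat = re.compile(r"\ |-|・|,")
text = "★…━★ search-engine-spam keywords keywords"  # hypothetical token text
words = token_split_pat.split(text.strip())
avg_char_len = len(text) / len(words)  # 41 characters over 6 split words ≈ 6.8
print(avg_char_len > 5)  # True: long enough on average to trigger pattern removal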