hojichar.filters.token_filters
import re
from typing import Any

from hojichar.core.filter_interface import TokenFilter
from hojichar.core.models import Token


class TokenAddDebagTag(TokenFilter):
    """Append a debug tag to the end of the token."""

    def apply(self, token: Token) -> Token:  # type: ignore
        """
        >>> TokenAddDebagTag()("hello")
        'hello<sep>'
        """
        token.text += "<sep>"
        return token


class SEOTokenRemover(TokenFilter):
    """
    This process was migrated from legacy code.
    I couldn't understand what this process was about, mainly because
    the regex pattern is too complex.
    """

    def __init__(self, min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.token_split_pat = re.compile(r"\ |-|・|,")
        self.min_average_seo_char_length = min_average_seo_char_length
        self.replace_pat = re.compile(
            r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"  # noqa
        )

    def apply(self, token: Token) -> Token:  # type: ignore
        seo_words = self.token_split_pat.split(token.text.strip())
        n_words = len(seo_words)
        if n_words == 0:
            return token
        avg_char_len = len(token.text) / n_words

        if avg_char_len <= self.min_average_seo_char_length:
            return token

        replace_patterns = self.replace_pat.search(token.text)
        if replace_patterns is not None:
            token.text = token.text.replace(replace_patterns.group(0), "", 1)

        return token
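A minimal usage sketch for the filters in this module. It assumes Token can be constructed directly from a raw string (the constructor call here is an assumption, not confirmed by the source above) and that apply mutates token.text in place, which the source does show:

from hojichar.core.models import Token
from hojichar.filters.token_filters import TokenAddDebagTag

token = Token("hello")  # assumed constructor: Token from a raw string
token = TokenAddDebagTag().apply(token)  # appends "<sep>" to token.text
print(token.text)  # -> hello<sep>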
class TokenAddDebagTag(TokenFilter):
    """Append a debug tag to the end of the token."""

    def apply(self, token: Token) -> Token:  # type: ignore
        """
        >>> TokenAddDebagTag()("hello")
        'hello<sep>'
        """
        token.text += "<sep>"
        return token
Append a debug tag to the end of the token.
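As the doctest above shows, a filter instance can also be called directly on a string:

>>> from hojichar.filters.token_filters import TokenAddDebagTag
>>> TokenAddDebagTag()("hello")
'hello<sep>'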
class SEOTokenRemover(TokenFilter):
    """
    This process was migrated from legacy code.
    I couldn't understand what this process was about, mainly because
    the regex pattern is too complex.
    """

    def __init__(self, min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.token_split_pat = re.compile(r"\ |-|・|,")
        self.min_average_seo_char_length = min_average_seo_char_length
        self.replace_pat = re.compile(
            r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"  # noqa
        )

    def apply(self, token: Token) -> Token:  # type: ignore
        seo_words = self.token_split_pat.split(token.text.strip())
        n_words = len(seo_words)
        if n_words == 0:
            return token
        avg_char_len = len(token.text) / n_words

        if avg_char_len <= self.min_average_seo_char_length:
            return token

        replace_patterns = self.replace_pat.search(token.text)
        if replace_patterns is not None:
            token.text = token.text.replace(replace_patterns.group(0), "", 1)

        return token
This process was migrated from legacy code. I couldn't understand what this process was about, mainly because the regex pattern is too complex.
    def __init__(self, min_average_seo_char_length: int = 5, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.token_split_pat = re.compile(r"\ |-|・|,")
        self.min_average_seo_char_length = min_average_seo_char_length
        self.replace_pat = re.compile(
            r"\-{5,},@[a-zA-Z0-9]+,[#\$\%\-]{4,},[_=#\$\%\-]{4,}[\ ]*.+?[\ ]*[_=#\$\%\-]{4,}|★[…━]+★"  # noqa
        )
Initialize the filter.

Parameters

p : float
    The probability of applying the filter. If p is 1, the filter will always be applied.
skip_rejected : bool
    If True, the filter will skip documents that are already rejected. If you want to apply the filter to all documents (e.g., postprocess), set this to False.
random_state : Optional[Union[int, np.random.Generator]]
    Seed for the random number generator. If None, a new random number generator will be created. If None and used within the Compose class, the random state is shared with the Compose object.
use_batch : bool
    If True, the filter will process documents in batches in the apply_stream method.
batch_size : int
    The size of the batch used to process documents in the apply_stream method.
kwargs : Any
    Additional keyword arguments to pass to the filter.
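The parameters above belong to the TokenFilter base class and reach it through **kwargs, as the signature super().__init__(*args, **kwargs) suggests. A minimal construction sketch, assuming the base class accepts them as keyword arguments:

from hojichar.filters.token_filters import SEOTokenRemover

# Apply the filter to every token and skip tokens of already-rejected
# documents; min_average_seo_char_length is this filter's own threshold.
remover = SEOTokenRemover(
    min_average_seo_char_length=5,
    p=1.0,
    skip_rejected=True,
)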
    def apply(self, token: Token) -> Token:  # type: ignore
        seo_words = self.token_split_pat.split(token.text.strip())
        n_words = len(seo_words)
        if n_words == 0:
            return token
        avg_char_len = len(token.text) / n_words

        if avg_char_len <= self.min_average_seo_char_length:
            return token

        replace_patterns = self.replace_pat.search(token.text)
        if replace_patterns is not None:
            token.text = token.text.replace(replace_patterns.group(0), "", 1)

        return token
Definition of filter behavior.

The document must implement the TextContent protocol; in most cases the hojichar.Document class is used.

In this method, the filter modifies document.text or document.extras, and sets document.is_rejected = True to discard the document.

Parameters

document : Document
    Input document

Returns

Document
    Processed document
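For intuition, a minimal sketch of the removal step. The example text is illustrative (my own, not from the source), chosen so that the ★[…━]+★ alternative of the regex matches the trailing decoration, and it again assumes Token can be constructed from a raw string:

from hojichar.core.models import Token
from hojichar.filters.token_filters import SEOTokenRemover

# "keyword1・keyword2★━━━★" splits on "・" into 2 chunks;
# average chunk length is 22 / 2 = 11 > 5, so removal runs.
token = Token("keyword1・keyword2★━━━★")
token = SEOTokenRemover().apply(token)  # strips the first regex match
print(token.text)  # -> keyword1・keyword2

Note that removal only runs when the average chunk length exceeds min_average_seo_char_length; tokens made of short chunks are returned unchanged.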