hojichar.core.models
import time
from dataclasses import dataclass, fields
from typing import Any, Dict, List, Optional

from hojichar.utils.warn_deprecation import deprecated_since


@deprecated_since("0.1.0", "Document")
class Token:
    """
    Legacy text unit, decorated with `deprecated_since("0.1.0", "Document")`.

    Attributes:
        text (str): The current text of the token.
        is_rejected (bool): A flag indicating whether the token is rejected.
    """

    def __init__(self, text: str, is_rejected: bool = False) -> None:
        self.text = text
        # Private copy of the constructor text, exposed read-only via `original`.
        self.__original = text
        self.is_rejected = is_rejected

    @property
    def original(self) -> str:
        """Return the text that was passed to the constructor."""
        return self.__original

    def __str__(self) -> str:
        return self.text


class Document:
    """
    Document class represents a text document with metadata.
    It contains the text of the document, a flag indicating whether it is rejected,
    and additional metadata stored in the `extras` dictionary.

    The `tokens` attribute will be deprecated in future versions,
    and users are encouraged to use the `extras` dictionary to store token-related information.

    Attributes:
        text (str): The text content of the document.
        is_rejected (bool): A flag indicating whether the document is rejected.
        extras (Dict[str, Any]): A dictionary to store additional metadata about the document.
        reject_reason (Dict[str, Any]): A dictionary to store the reason for rejection.
            The filter class and its member names and values are logged here.

    The following attributes will be deprecated in future versions:
        tokens (List[Token]): A list of tokens extracted from the document.
    """

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List[Token]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.text = text
        # Private copy of the constructor text, exposed read-only via `original`.
        self.__original = text
        self.is_rejected = is_rejected
        # `None` defaults avoid sharing one mutable container between instances.
        self.tokens: List[Token] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """Return the text that was passed to the constructor."""
        return self.__original

    @deprecated_since("1.0.0")
    def set_tokens(self, tokens: List[str]) -> None:
        """Replace `self.tokens` with one `Token` per string in `tokens`."""
        self.tokens = [Token(token) for token in tokens]

    @deprecated_since("1.0.0")
    def get_tokens(self) -> List[str]:
        """Return the `text` of every token in `self.tokens`."""
        return [token.text for token in self.tokens]

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return (
            f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
        )


@dataclass
class Statistics:
    """
    Statistics class to track the performance of the document processing pipeline.
    """

    name: Optional[str] = None
    input_num: int = 0
    input_bytes: int = 0
    input_chars: int = 0
    output_num: int = 0
    output_bytes: int = 0
    output_chars: int = 0
    discard_num: int = 0
    diff_bytes: int = 0
    diff_chars: int = 0
    cumulative_time_ns: int = 0
    errors: int = 0

    @staticmethod
    def _counter_names() -> List[str]:
        """Return the names of all counter fields, i.e. every field except `name`."""
        return [f.name for f in fields(Statistics) if f.name != "name"]

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the Statistics object to a dictionary.
        """
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def update(self, other: "Statistics") -> None:
        """
        Update the statistics in place by adding the counters of another Statistics object.
        The `name` of this object is left unchanged.
        """
        for counter in Statistics._counter_names():
            setattr(self, counter, getattr(self, counter) + getattr(other, counter))

    def reset(self) -> "Statistics":
        """
        Reset every counter to zero, keep `name`, and return this object.
        """
        for counter in Statistics._counter_names():
            setattr(self, counter, 0)
        return self

    def update_by_diff(
        self,
        before: Dict[str, Any],
        after: Dict[str, Any],
    ) -> None:
        """
        Update the statistics by calculating the differences between two Doc-info mappings.
        This method is used to update the statistics after a filter is applied.

        Both mappings are read through the keys "is_rejected", "bytes", "chars"
        and "time_ns".
        """
        newly_rejected = not before["is_rejected"] and after["is_rejected"]

        # The document always counts as input, whatever happened to it.
        self.input_num += 1
        self.input_bytes += before["bytes"]
        self.input_chars += before["chars"]

        if newly_rejected:
            # Rejected by this step: the whole input is counted as removed.
            self.discard_num += 1
            self.diff_bytes -= before["bytes"]
            self.diff_chars -= before["chars"]
        else:
            # Every other case (not rejected afterwards, or already rejected
            # beforehand) is counted as output.
            self.output_num += 1
            self.output_bytes += after["bytes"]
            self.output_chars += after["chars"]
            self.diff_bytes += after["bytes"] - before["bytes"]
            self.diff_chars += after["chars"] - before["chars"]

        self.cumulative_time_ns += after["time_ns"] - before["time_ns"]

    @staticmethod
    def add(x: "Statistics", y: "Statistics") -> "Statistics":
        """
        Add two Statistics objects together and return a new Statistics object.

        Raises:
            AssertionError: If the names of the two objects do not match.
        """
        # Raised explicitly instead of using `assert`, so the check is still
        # performed when Python runs with -O.
        if x.name != y.name:
            raise AssertionError("Layer names must match")
        totals = {
            counter: getattr(x, counter) + getattr(y, counter)
            for counter in Statistics._counter_names()
        }
        return Statistics(name=x.name, **totals)

    @staticmethod
    def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]:
        """
        Add Statistics objects from two lists by matching their names.
        The result keeps the order of `x`.

        Raises:
            ValueError: If the sets of names in the two lists do not match.
        """
        # check if the names in both lists match
        names_x = {stat.name for stat in x}
        names_y = {stat.name for stat in y}
        if names_x != names_y:
            raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}")

        y_map = {stat.name: stat for stat in y}

        # keep the order of x and add corresponding y
        return [Statistics.add(stat_x, y_map[stat_x.name]) for stat_x in x]

    @staticmethod
    def get_filter(name: str, stats: List["Statistics"]) -> "Statistics":
        """
        Get a Statistics object by its name from a list of statistics.

        Returns:
            The first object in `stats` whose `name` equals `name`.

        Raises:
            KeyError: If no object in `stats` has the given name.
        """
        for stat in stats:
            if stat.name == name:
                return stat
        raise KeyError(f"Statistics with name '{name}' not found in the list.")


def get_doc_info(document: Document) -> Dict[str, Any]:
    """
    Create a document-info mapping from a Document instance.
    This function is used to extract metadata from the Document for statistics tracking.

    Returns:
        A dictionary with the keys "is_rejected", "bytes" (length of the UTF-8
        encoded text), "chars" (length of the text) and "time_ns" (the value of
        `time.perf_counter_ns()` at call time).
    """
    return {
        "is_rejected": document.is_rejected,
        "bytes": len(document.text.encode("utf-8")),
        "chars": len(document.text),
        "time_ns": time.perf_counter_ns(),
    }
@deprecated_since("0.1.0", "Document")
class Token:
    """
    Legacy text unit, decorated with `deprecated_since("0.1.0", "Document")`.

    Attributes:
        text (str): The current text of the token.
        is_rejected (bool): A flag indicating whether the token is rejected.
    """

    def __init__(self, text: str, is_rejected: bool = False) -> None:
        # Keep a private copy of the constructor text; it is exposed
        # read-only through the `original` property.
        self.__original = text
        self.is_rejected = is_rejected
        self.text = text

    def __str__(self) -> str:
        return self.text

    @property
    def original(self) -> str:
        """Return the text that was passed to the constructor."""
        return self.__original
class Document:
    """
    A text document together with its metadata.

    The `tokens` attribute will be deprecated in future versions; use the
    `extras` dictionary to store token-related information instead.

    Attributes:
        text (str): The text content of the document.
        is_rejected (bool): A flag indicating whether the document is rejected.
        extras (Dict[str, Any]): A dictionary to store additional metadata about the document.
        reject_reason (Dict[str, Any]): A dictionary to store the reason for rejection.
            The filter class and its member names and values are logged here.

    The following attributes will be deprecated in future versions:
        tokens (List[Token]): A list of tokens extracted from the document.
    """

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List[Token]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.text = text
        # Private copy of the constructor text, exposed via `original`.
        self.__original = text
        self.is_rejected = is_rejected
        # A fresh container is created per instance when none is supplied.
        self.tokens: List[Token] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """Return the text that was passed to the constructor."""
        return self.__original

    @deprecated_since("1.0.0")
    def set_tokens(self, tokens: List[str]) -> None:
        """Replace `self.tokens` with one `Token` per string in `tokens`."""
        self.tokens = list(map(Token, tokens))

    @deprecated_since("1.0.0")
    def get_tokens(self) -> List[str]:
        """Return the `text` of every token in `self.tokens`."""
        texts: List[str] = []
        for tok in self.tokens:
            texts.append(tok.text)
        return texts

    def __str__(self) -> str:
        return self.text

    def __repr__(self) -> str:
        return (
            f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
        )
Document class represents a text document with metadata.
It contains the text of the document, a flag indicating whether it is rejected,
and additional metadata stored in the extras
dictionary.
The tokens
attribute will be deprecated in future versions,
and users are encouraged to use the extras
dictionary to store token-related information.
Attributes: text (str): The text content of the document. is_rejected (bool): A flag indicating whether the document is rejected. extras (Dict[str, Any]): A dictionary to store additional metadata about the document. reject_reason (Dict[str, Any]): A dictionary to store the reason for rejection. The filter class and its member names and values are logged here.
The following attributes will be deprecated in future versions: tokens (List[Token]): A list of tokens extracted from the document.
def __init__(
    self,
    text: str,
    is_rejected: bool = False,
    tokens: Optional[List[Token]] = None,
    extras: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Initialise the document.

    Args:
        text: The text content; also kept privately and exposed via `original`.
        is_rejected: Initial value of the rejection flag.
        tokens: Token list to store; a new empty list is used when None.
        extras: Metadata dictionary to store; a new empty dict is used when None.
    """
    self.text = text
    self.__original = text
    self.is_rejected = is_rejected

    # `None` defaults give every instance its own container.
    self.tokens: List[Token] = [] if tokens is None else tokens
    self.extras: Dict[str, Any] = {} if extras is None else extras

    self.reject_reason: Dict[str, Any] = {}
@dataclass
class Statistics:
    """
    Statistics class to track the performance of the document processing pipeline.
    """

    name: Optional[str] = None
    input_num: int = 0
    input_bytes: int = 0
    input_chars: int = 0
    output_num: int = 0
    output_bytes: int = 0
    output_chars: int = 0
    discard_num: int = 0
    diff_bytes: int = 0
    diff_chars: int = 0
    cumulative_time_ns: int = 0
    errors: int = 0

    @staticmethod
    def _counter_names() -> List[str]:
        """Return the names of all counter fields, i.e. every field except `name`."""
        return [f.name for f in fields(Statistics) if f.name != "name"]

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert the Statistics object to a dictionary.
        """
        return {f.name: getattr(self, f.name) for f in fields(self)}

    def update(self, other: "Statistics") -> None:
        """
        Update the statistics in place by adding the counters of another Statistics object.
        The `name` of this object is left unchanged.
        """
        for counter in Statistics._counter_names():
            setattr(self, counter, getattr(self, counter) + getattr(other, counter))

    def reset(self) -> "Statistics":
        """
        Reset every counter to zero, keep `name`, and return this object.
        """
        for counter in Statistics._counter_names():
            setattr(self, counter, 0)
        return self

    def update_by_diff(
        self,
        before: Dict[str, Any],
        after: Dict[str, Any],
    ) -> None:
        """
        Update the statistics by calculating the differences between two Doc-info mappings.
        This method is used to update the statistics after a filter is applied.

        Both mappings are read through the keys "is_rejected", "bytes", "chars"
        and "time_ns".
        """
        newly_rejected = not before["is_rejected"] and after["is_rejected"]

        # The document always counts as input, whatever happened to it.
        self.input_num += 1
        self.input_bytes += before["bytes"]
        self.input_chars += before["chars"]

        if newly_rejected:
            # Rejected by this step: the whole input is counted as removed.
            self.discard_num += 1
            self.diff_bytes -= before["bytes"]
            self.diff_chars -= before["chars"]
        else:
            # Every other case (not rejected afterwards, or already rejected
            # beforehand) is counted as output.
            self.output_num += 1
            self.output_bytes += after["bytes"]
            self.output_chars += after["chars"]
            self.diff_bytes += after["bytes"] - before["bytes"]
            self.diff_chars += after["chars"] - before["chars"]

        self.cumulative_time_ns += after["time_ns"] - before["time_ns"]

    @staticmethod
    def add(x: "Statistics", y: "Statistics") -> "Statistics":
        """
        Add two Statistics objects together and return a new Statistics object.

        Raises:
            AssertionError: If the names of the two objects do not match.
        """
        # Raised explicitly instead of using `assert`, so the check is still
        # performed when Python runs with -O.
        if x.name != y.name:
            raise AssertionError("Layer names must match")
        totals = {
            counter: getattr(x, counter) + getattr(y, counter)
            for counter in Statistics._counter_names()
        }
        return Statistics(name=x.name, **totals)

    @staticmethod
    def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]:
        """
        Add Statistics objects from two lists by matching their names.
        The result keeps the order of `x`.

        Raises:
            ValueError: If the sets of names in the two lists do not match.
        """
        # check if the names in both lists match
        names_x = {stat.name for stat in x}
        names_y = {stat.name for stat in y}
        if names_x != names_y:
            raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}")

        y_map = {stat.name: stat for stat in y}

        # keep the order of x and add corresponding y
        return [Statistics.add(stat_x, y_map[stat_x.name]) for stat_x in x]

    @staticmethod
    def get_filter(name: str, stats: List["Statistics"]) -> "Statistics":
        """
        Get a Statistics object by its name from a list of statistics.

        Returns:
            The first object in `stats` whose `name` equals `name`.

        Raises:
            KeyError: If no object in `stats` has the given name.
        """
        for stat in stats:
            if stat.name == name:
                return stat
        raise KeyError(f"Statistics with name '{name}' not found in the list.")
Statistics class to track the performance of the document processing pipeline.
def to_dict(self) -> Dict[str, Any]:
    """
    Return a dictionary mapping each dataclass field name of this object to
    its current value, in field-definition order.
    """
    result: Dict[str, Any] = {}
    for spec in fields(self):
        result[spec.name] = getattr(self, spec.name)
    return result
Convert the Statistics object to a dictionary.
def update(self, other: "Statistics") -> None:
    """
    Accumulate the counters of `other` into this object in place.
    The `name` attribute is not touched.
    """
    for counter in (
        "input_num",
        "input_bytes",
        "input_chars",
        "output_num",
        "output_bytes",
        "output_chars",
        "discard_num",
        "diff_bytes",
        "diff_chars",
        "cumulative_time_ns",
        "errors",
    ):
        setattr(self, counter, getattr(self, counter) + getattr(other, counter))
Update the statistics by adding another Statistics object.
def reset(self) -> "Statistics":
    """
    Set every counter back to zero and return this object.
    The `name` attribute is not touched.
    """
    for counter in (
        "input_num",
        "input_bytes",
        "input_chars",
        "output_num",
        "output_bytes",
        "output_chars",
        "discard_num",
        "diff_bytes",
        "diff_chars",
        "cumulative_time_ns",
        "errors",
    ):
        setattr(self, counter, 0)
    return self
Reset the statistics to their initial values.
def update_by_diff(
    self,
    before: dict[str, Any],
    after: dict[str, Any],
) -> None:
    """
    Fold one before/after pair of Doc-info mappings into the counters.
    Called after a filter has been applied to a document.

    Both mappings are read through the keys "is_rejected", "bytes", "chars"
    and "time_ns".
    """
    rejected_here = not before["is_rejected"] and after["is_rejected"]

    size_in = before["bytes"]
    chars_in = before["chars"]

    # The document counts as input in every case.
    self.input_num += 1
    self.input_bytes += size_in
    self.input_chars += chars_in

    if rejected_here:
        # Rejected by this step: the whole input is counted as removed.
        self.discard_num += 1
        self.diff_bytes -= size_in
        self.diff_chars -= chars_in
    else:
        # All remaining cases are counted as output.
        size_out = after["bytes"]
        chars_out = after["chars"]
        self.output_num += 1
        self.output_bytes += size_out
        self.output_chars += chars_out
        self.diff_bytes += size_out - size_in
        self.diff_chars += chars_out - chars_in

    elapsed = after["time_ns"] - before["time_ns"]
    self.cumulative_time_ns += elapsed
Update the statistics by calculating the differences between two Doc-info mappings. This method is used to update the statistics after a filter is applied.
175 @staticmethod 176 def add(x: "Statistics", y: "Statistics") -> "Statistics": 177 """ 178 Add two Statistics objects together. 179 This method assumes that the names of the two Statistics objects match. 180 If they do not match, it will raise an AssertionError.""" 181 assert x.name == y.name, "Layer names must match" 182 return Statistics( 183 name=x.name, 184 input_num=x.input_num + y.input_num, 185 input_bytes=x.input_bytes + y.input_bytes, 186 input_chars=x.input_chars + y.input_chars, 187 output_num=x.output_num + y.output_num, 188 output_bytes=x.output_bytes + y.output_bytes, 189 output_chars=x.output_chars + y.output_chars, 190 discard_num=x.discard_num + y.discard_num, 191 diff_bytes=x.diff_bytes + y.diff_bytes, 192 diff_chars=x.diff_chars + y.diff_chars, 193 cumulative_time_ns=x.cumulative_time_ns + y.cumulative_time_ns, 194 errors=x.errors + y.errors, 195 )
Add two Statistics objects together. This method assumes that the names of the two Statistics objects match. If they do not match, it will raise an AssertionError.
197 @staticmethod 198 def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]: 199 """ 200 Add FilterStatistics objects from two lists by matching their names. 201 This method assumes that both lists contain FilterStatistics objects 202 with the same names, and it will raise a ValueError if the sets of names 203 in the two lists do not match. 204 """ 205 # check if the names in both lists match 206 names_x = {stat.name for stat in x} 207 names_y = {stat.name for stat in y} 208 if names_x != names_y: 209 raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}") 210 211 y_map = {stat.name: stat for stat in y} 212 213 # keep the order of x and add corresponding y 214 result: List[Statistics] = [] 215 for stat_x in x: 216 stat_y = y_map[stat_x.name] 217 result.append(Statistics.add(stat_x, stat_y)) 218 219 return result
Add Statistics objects from two lists by matching their names. This method assumes that both lists contain Statistics objects with the same names, and it will raise a ValueError if the sets of names in the two lists do not match.
221 @staticmethod 222 def get_filter(name: str, stats: List["Statistics"]) -> "Statistics": 223 """ 224 Get a Statistics object by its name from a list of statistics. 225 If the name is not found, return None. 226 """ 227 for stat in stats: 228 if stat.name == name: 229 return stat 230 raise KeyError(f"Statistics with name '{name}' not found in the list.")
Get a Statistics object by its name from a list of statistics. If the name is not found, a KeyError is raised.
def get_doc_info(document: Document) -> dict[str, Any]:
    """
    Build the document-info mapping used for statistics tracking.

    Returns:
        A dictionary with the keys "is_rejected", "bytes" (length of the UTF-8
        encoded text), "chars" (length of the text) and "time_ns" (the value of
        `time.perf_counter_ns()` at call time).
    """
    rejected = document.is_rejected
    content = document.text
    byte_count = len(content.encode("utf-8"))
    char_count = len(content)
    # The timestamp is taken last, after the sizes have been measured.
    return {
        "is_rejected": rejected,
        "bytes": byte_count,
        "chars": char_count,
        "time_ns": time.perf_counter_ns(),
    }
Create a document-info mapping from a Document instance. This function is used to extract metadata from the Document for statistics tracking.