hojichar.core.models

  1import time
  2from dataclasses import dataclass, fields
  3from typing import Any, Dict, List, Optional
  4
  5from hojichar.utils.warn_deprecation import deprecated_since
  6
  7
  8@deprecated_since("0.1.0", "Document")
  9class Token:
 10    def __init__(self, text: str, is_rejected: bool = False) -> None:
 11        self.text = text
 12        self.__original = text
 13        self.is_rejected = is_rejected
 14
 15    @property
 16    def original(self) -> str:
 17        return self.__original
 18
 19    def __str__(self) -> str:
 20        return self.text
 21
 22
 23class Document:
 24    """
 25    Document class represents a text document with metadata.
 26    It contains the text of the document, a flag indicating whether it is rejected,
 27     and additional metadata stored in the `extras` dictionary.
 28
 29    The `tokens` attribute will be deprecated in future versions,
 30    and users are encouraged to use the `extras` dictionary to store token-related information.
 31
 32    Attributes:
 33        text (str): The text content of the document.
 34        is_rejected (bool): A flag indicating whether the document is rejected.
 35        extras (Dict[str, Any]): A dictionary to store additional metadata about the document.
 36        reject_reason (Dict[str, Any]): A dictionary to store the reason for rejection. The
 37          filter class and the member name and value will logged at the filter is logged here.
 38
 39    Next attributes will be deprecated in future versions:
 40        tokens (List[Token]): A list of tokens extracted from the document.
 41    """
 42
 43    def __init__(
 44        self,
 45        text: str,
 46        is_rejected: bool = False,
 47        tokens: Optional[List[Token]] = None,
 48        extras: Optional[Dict[str, Any]] = None,
 49    ) -> None:
 50        self.text = text
 51        self.__original = text
 52        self.is_rejected = is_rejected
 53        if tokens is None:
 54            self.tokens: List[Token] = []
 55        else:
 56            self.tokens = tokens
 57
 58        if extras is None:
 59            self.extras: Dict[str, Any] = {}
 60        else:
 61            self.extras = extras
 62
 63        self.reject_reason: Dict[str, Any] = {}
 64
 65    @property
 66    def original(self) -> str:
 67        return self.__original
 68
 69    @deprecated_since("1.0.0")
 70    def set_tokens(self, tokens: List[str]) -> None:
 71        self.tokens = [Token(token) for token in tokens]
 72
 73    @deprecated_since("1.0.0")
 74    def get_tokens(self) -> List[str]:
 75        return [token.text for token in self.tokens]
 76
 77    def __str__(self) -> str:
 78        return self.text
 79
 80    def __repr__(self) -> str:
 81        return (
 82            f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
 83        )
 84
 85
 86@dataclass
 87class Statistics:
 88    """
 89    Statistics class to track the performance of the document processing pipeline.
 90    """
 91
 92    name: Optional[str] = None
 93    input_num: int = 0
 94    input_bytes: int = 0
 95    input_chars: int = 0
 96    output_num: int = 0
 97    output_bytes: int = 0
 98    output_chars: int = 0
 99    discard_num: int = 0
100    diff_bytes: int = 0
101    diff_chars: int = 0
102    cumulative_time_ns: int = 0
103    errors: int = 0
104
105    def to_dict(self) -> Dict[str, Any]:
106        """
107        Convert the Statistics object to a dictionary.
108        """
109        return {f.name: getattr(self, f.name) for f in fields(self)}
110
111    def update(self, other: "Statistics") -> None:
112        """
113        Update the statistics by adding another Statistics object.
114        """
115        self.input_num += other.input_num
116        self.input_bytes += other.input_bytes
117        self.input_chars += other.input_chars
118        self.output_num += other.output_num
119        self.output_bytes += other.output_bytes
120        self.output_chars += other.output_chars
121        self.discard_num += other.discard_num
122        self.diff_bytes += other.diff_bytes
123        self.diff_chars += other.diff_chars
124        self.cumulative_time_ns += other.cumulative_time_ns
125        self.errors += other.errors
126
127    def reset(self) -> "Statistics":
128        """
129        Reset the statistics to their initial values.
130        """
131        self.input_num = 0
132        self.input_bytes = 0
133        self.input_chars = 0
134        self.output_num = 0
135        self.output_bytes = 0
136        self.output_chars = 0
137        self.discard_num = 0
138        self.diff_bytes = 0
139        self.diff_chars = 0
140        self.cumulative_time_ns = 0
141        self.errors = 0
142        return self
143
144    def update_by_diff(
145        self,
146        before: dict[str, Any],
147        after: dict[str, Any],
148    ) -> None:
149        """
150        Update the statistics by calculating the differences between two Doc-info mappings.
151        This method is used to update the statistics after a filter is applied.
152        """
153        if not before["is_rejected"] and after["is_rejected"]:
154            # Document is rejected after the filter is applied
155            self.input_num += 1
156            self.input_bytes += before["bytes"]
157            self.input_chars += before["chars"]
158            self.discard_num += 1
159            self.diff_bytes -= before["bytes"]
160            self.diff_chars -= before["chars"]
161            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]
162        else:
163            # Document is not rejected or still not rejected after the filter is applied
164            self.input_num += 1
165            self.input_bytes += before["bytes"]
166            self.input_chars += before["chars"]
167            self.output_num += 1
168            self.output_bytes += after["bytes"]
169            self.output_chars += after["chars"]
170            self.diff_bytes += after["bytes"] - before["bytes"]
171            self.diff_chars += after["chars"] - before["chars"]
172            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]
173
174    @staticmethod
175    def add(x: "Statistics", y: "Statistics") -> "Statistics":
176        """
177        Add two Statistics objects together.
178        This method assumes that the names of the two Statistics objects match.
179        If they do not match, it will raise an AssertionError."""
180        assert x.name == y.name, "Layer names must match"
181        return Statistics(
182            name=x.name,
183            input_num=x.input_num + y.input_num,
184            input_bytes=x.input_bytes + y.input_bytes,
185            input_chars=x.input_chars + y.input_chars,
186            output_num=x.output_num + y.output_num,
187            output_bytes=x.output_bytes + y.output_bytes,
188            output_chars=x.output_chars + y.output_chars,
189            discard_num=x.discard_num + y.discard_num,
190            diff_bytes=x.diff_bytes + y.diff_bytes,
191            diff_chars=x.diff_chars + y.diff_chars,
192            cumulative_time_ns=x.cumulative_time_ns + y.cumulative_time_ns,
193            errors=x.errors + y.errors,
194        )
195
196    @staticmethod
197    def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]:
198        """
199        Add FilterStatistics objects from two lists by matching their names.
200        This method assumes that both lists contain FilterStatistics objects
201        with the same names, and it will raise a ValueError if the sets of names
202        in the two lists do not match.
203        """
204        # check if the names in both lists match
205        names_x = {stat.name for stat in x}
206        names_y = {stat.name for stat in y}
207        if names_x != names_y:
208            raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}")
209
210        y_map = {stat.name: stat for stat in y}
211
212        # keep the order of x and add corresponding y
213        result: List[Statistics] = []
214        for stat_x in x:
215            stat_y = y_map[stat_x.name]
216            result.append(Statistics.add(stat_x, stat_y))
217
218        return result
219
220    @staticmethod
221    def get_filter(name: str, stats: List["Statistics"]) -> "Statistics":
222        """
223        Get a Statistics object by its name from a list of statistics.
224        If the name is not found, return None.
225        """
226        for stat in stats:
227            if stat.name == name:
228                return stat
229        raise KeyError(f"Statistics with name '{name}' not found in the list.")
230
231
def get_doc_info(document: Document) -> Dict[str, Any]:
    """
    Create a document-info mapping from a Document instance.

    This function is used to extract metadata from the Document for
    statistics tracking.

    Returns:
        A dictionary with the keys:
          "is_rejected": the document's rejection flag,
          "bytes": length of the text encoded as UTF-8,
          "chars": length of the text in characters,
          "time_ns": a `time.perf_counter_ns()` reading.
    """
    rejected = document.is_rejected
    text = document.text
    return {
        "is_rejected": rejected,
        "bytes": len(text.encode("utf-8")),
        "chars": len(text),
        # Evaluated last, i.e. after the sizes have been computed.
        "time_ns": time.perf_counter_ns(),
    }
@deprecated_since('0.1.0', 'Document')
class Token:
 9@deprecated_since("0.1.0", "Document")
10class Token:
11    def __init__(self, text: str, is_rejected: bool = False) -> None:
12        self.text = text
13        self.__original = text
14        self.is_rejected = is_rejected
15
16    @property
17    def original(self) -> str:
18        return self.__original
19
20    def __str__(self) -> str:
21        return self.text
Token(text: str, is_rejected: bool = False)
11    def __init__(self, text: str, is_rejected: bool = False) -> None:
12        self.text = text
13        self.__original = text
14        self.is_rejected = is_rejected
class Document:
class Document:
    """
    A text document together with its processing metadata.

    The object holds the document text, a rejection flag, and arbitrary
    additional metadata kept in the `extras` dictionary.

    The `tokens` attribute will be deprecated in future versions; users are
    encouraged to keep token-related information in `extras` instead.

    Attributes:
        text (str): The text content of the document.
        is_rejected (bool): A flag indicating whether the document is rejected.
        extras (Dict[str, Any]): Additional metadata about the document.
        reject_reason (Dict[str, Any]): Intended to hold the reason for
          rejection (the filter class plus the member name and value). This
          class only initialises it to an empty dictionary.

    The following attributes will be deprecated in future versions:
        tokens (List[Token]): A list of tokens extracted from the document.
    """

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List[Token]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        self.text = text
        # Name-mangled; exposed read-only through the `original` property.
        self.__original = text
        self.is_rejected = is_rejected
        # A fresh container is created per instance when none is supplied;
        # a supplied list/dict is stored as-is, not copied.
        self.tokens: List[Token] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """The text that was passed to the constructor."""
        return self.__original

    @deprecated_since("1.0.0")
    def set_tokens(self, tokens: List[str]) -> None:
        """Replace `self.tokens` with one `Token` per string in `tokens`."""
        self.tokens = list(map(Token, tokens))

    @deprecated_since("1.0.0")
    def get_tokens(self) -> List[str]:
        """Return the current `text` of every token, in order."""
        texts: List[str] = []
        for tok in self.tokens:
            texts.append(tok.text)
        return texts

    def __str__(self) -> str:
        """Return the current document text."""
        return self.text

    def __repr__(self) -> str:
        """Return a debug representation showing text, rejection flag and extras."""
        return f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa

Document class represents a text document with metadata. It contains the text of the document, a flag indicating whether it is rejected, and additional metadata stored in the extras dictionary.

The tokens attribute will be deprecated in future versions, and users are encouraged to use the extras dictionary to store token-related information.

Attributes: text (str): The text content of the document. is_rejected (bool): A flag indicating whether the document is rejected. extras (Dict[str, Any]): A dictionary to store additional metadata about the document. reject_reason (Dict[str, Any]): A dictionary to store the reason for rejection. The filter class and the member name and value are logged here by the filter.

The following attributes will be deprecated in future versions: tokens (List[Token]): A list of tokens extracted from the document.

Document( text: str, is_rejected: bool = False, tokens: Optional[List[hojichar.core.models.Token]] = None, extras: Optional[Dict[str, Any]] = None)
44    def __init__(
45        self,
46        text: str,
47        is_rejected: bool = False,
48        tokens: Optional[List[Token]] = None,
49        extras: Optional[Dict[str, Any]] = None,
50    ) -> None:
51        self.text = text
52        self.__original = text
53        self.is_rejected = is_rejected
54        if tokens is None:
55            self.tokens: List[Token] = []
56        else:
57            self.tokens = tokens
58
59        if extras is None:
60            self.extras: Dict[str, Any] = {}
61        else:
62            self.extras = extras
63
64        self.reject_reason: Dict[str, Any] = {}
@deprecated_since('1.0.0')
def set_tokens(self, tokens: List[str]) -> None:
70    @deprecated_since("1.0.0")
71    def set_tokens(self, tokens: List[str]) -> None:
72        self.tokens = [Token(token) for token in tokens]
@deprecated_since('1.0.0')
def get_tokens(self) -> List[str]:
74    @deprecated_since("1.0.0")
75    def get_tokens(self) -> List[str]:
76        return [token.text for token in self.tokens]
@dataclass
class Statistics:
 87@dataclass
 88class Statistics:
 89    """
 90    Statistics class to track the performance of the document processing pipeline.
 91    """
 92
 93    name: Optional[str] = None
 94    input_num: int = 0
 95    input_bytes: int = 0
 96    input_chars: int = 0
 97    output_num: int = 0
 98    output_bytes: int = 0
 99    output_chars: int = 0
100    discard_num: int = 0
101    diff_bytes: int = 0
102    diff_chars: int = 0
103    cumulative_time_ns: int = 0
104    errors: int = 0
105
106    def to_dict(self) -> Dict[str, Any]:
107        """
108        Convert the Statistics object to a dictionary.
109        """
110        return {f.name: getattr(self, f.name) for f in fields(self)}
111
112    def update(self, other: "Statistics") -> None:
113        """
114        Update the statistics by adding another Statistics object.
115        """
116        self.input_num += other.input_num
117        self.input_bytes += other.input_bytes
118        self.input_chars += other.input_chars
119        self.output_num += other.output_num
120        self.output_bytes += other.output_bytes
121        self.output_chars += other.output_chars
122        self.discard_num += other.discard_num
123        self.diff_bytes += other.diff_bytes
124        self.diff_chars += other.diff_chars
125        self.cumulative_time_ns += other.cumulative_time_ns
126        self.errors += other.errors
127
128    def reset(self) -> "Statistics":
129        """
130        Reset the statistics to their initial values.
131        """
132        self.input_num = 0
133        self.input_bytes = 0
134        self.input_chars = 0
135        self.output_num = 0
136        self.output_bytes = 0
137        self.output_chars = 0
138        self.discard_num = 0
139        self.diff_bytes = 0
140        self.diff_chars = 0
141        self.cumulative_time_ns = 0
142        self.errors = 0
143        return self
144
145    def update_by_diff(
146        self,
147        before: dict[str, Any],
148        after: dict[str, Any],
149    ) -> None:
150        """
151        Update the statistics by calculating the differences between two Doc-info mappings.
152        This method is used to update the statistics after a filter is applied.
153        """
154        if not before["is_rejected"] and after["is_rejected"]:
155            # Document is rejected after the filter is applied
156            self.input_num += 1
157            self.input_bytes += before["bytes"]
158            self.input_chars += before["chars"]
159            self.discard_num += 1
160            self.diff_bytes -= before["bytes"]
161            self.diff_chars -= before["chars"]
162            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]
163        else:
164            # Document is not rejected or still not rejected after the filter is applied
165            self.input_num += 1
166            self.input_bytes += before["bytes"]
167            self.input_chars += before["chars"]
168            self.output_num += 1
169            self.output_bytes += after["bytes"]
170            self.output_chars += after["chars"]
171            self.diff_bytes += after["bytes"] - before["bytes"]
172            self.diff_chars += after["chars"] - before["chars"]
173            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]
174
175    @staticmethod
176    def add(x: "Statistics", y: "Statistics") -> "Statistics":
177        """
178        Add two Statistics objects together.
179        This method assumes that the names of the two Statistics objects match.
180        If they do not match, it will raise an AssertionError."""
181        assert x.name == y.name, "Layer names must match"
182        return Statistics(
183            name=x.name,
184            input_num=x.input_num + y.input_num,
185            input_bytes=x.input_bytes + y.input_bytes,
186            input_chars=x.input_chars + y.input_chars,
187            output_num=x.output_num + y.output_num,
188            output_bytes=x.output_bytes + y.output_bytes,
189            output_chars=x.output_chars + y.output_chars,
190            discard_num=x.discard_num + y.discard_num,
191            diff_bytes=x.diff_bytes + y.diff_bytes,
192            diff_chars=x.diff_chars + y.diff_chars,
193            cumulative_time_ns=x.cumulative_time_ns + y.cumulative_time_ns,
194            errors=x.errors + y.errors,
195        )
196
197    @staticmethod
198    def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]:
199        """
200        Add FilterStatistics objects from two lists by matching their names.
201        This method assumes that both lists contain FilterStatistics objects
202        with the same names, and it will raise a ValueError if the sets of names
203        in the two lists do not match.
204        """
205        # check if the names in both lists match
206        names_x = {stat.name for stat in x}
207        names_y = {stat.name for stat in y}
208        if names_x != names_y:
209            raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}")
210
211        y_map = {stat.name: stat for stat in y}
212
213        # keep the order of x and add corresponding y
214        result: List[Statistics] = []
215        for stat_x in x:
216            stat_y = y_map[stat_x.name]
217            result.append(Statistics.add(stat_x, stat_y))
218
219        return result
220
221    @staticmethod
222    def get_filter(name: str, stats: List["Statistics"]) -> "Statistics":
223        """
224        Get a Statistics object by its name from a list of statistics.
225        If the name is not found, return None.
226        """
227        for stat in stats:
228            if stat.name == name:
229                return stat
230        raise KeyError(f"Statistics with name '{name}' not found in the list.")

Statistics class to track the performance of the document processing pipeline.

Statistics( name: Optional[str] = None, input_num: int = 0, input_bytes: int = 0, input_chars: int = 0, output_num: int = 0, output_bytes: int = 0, output_chars: int = 0, discard_num: int = 0, diff_bytes: int = 0, diff_chars: int = 0, cumulative_time_ns: int = 0, errors: int = 0)
def to_dict(self) -> Dict[str, Any]:
106    def to_dict(self) -> Dict[str, Any]:
107        """
108        Convert the Statistics object to a dictionary.
109        """
110        return {f.name: getattr(self, f.name) for f in fields(self)}

Convert the Statistics object to a dictionary.

def update(self, other: hojichar.core.models.Statistics) -> None:
112    def update(self, other: "Statistics") -> None:
113        """
114        Update the statistics by adding another Statistics object.
115        """
116        self.input_num += other.input_num
117        self.input_bytes += other.input_bytes
118        self.input_chars += other.input_chars
119        self.output_num += other.output_num
120        self.output_bytes += other.output_bytes
121        self.output_chars += other.output_chars
122        self.discard_num += other.discard_num
123        self.diff_bytes += other.diff_bytes
124        self.diff_chars += other.diff_chars
125        self.cumulative_time_ns += other.cumulative_time_ns
126        self.errors += other.errors

Update the statistics by adding another Statistics object.

def reset(self) -> hojichar.core.models.Statistics:
128    def reset(self) -> "Statistics":
129        """
130        Reset the statistics to their initial values.
131        """
132        self.input_num = 0
133        self.input_bytes = 0
134        self.input_chars = 0
135        self.output_num = 0
136        self.output_bytes = 0
137        self.output_chars = 0
138        self.discard_num = 0
139        self.diff_bytes = 0
140        self.diff_chars = 0
141        self.cumulative_time_ns = 0
142        self.errors = 0
143        return self

Reset the statistics to their initial values.

def update_by_diff( self, before: dict[str, typing.Any], after: dict[str, typing.Any]) -> None:
145    def update_by_diff(
146        self,
147        before: dict[str, Any],
148        after: dict[str, Any],
149    ) -> None:
150        """
151        Update the statistics by calculating the differences between two Doc-info mappings.
152        This method is used to update the statistics after a filter is applied.
153        """
154        if not before["is_rejected"] and after["is_rejected"]:
155            # Document is rejected after the filter is applied
156            self.input_num += 1
157            self.input_bytes += before["bytes"]
158            self.input_chars += before["chars"]
159            self.discard_num += 1
160            self.diff_bytes -= before["bytes"]
161            self.diff_chars -= before["chars"]
162            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]
163        else:
164            # Document is not rejected or still not rejected after the filter is applied
165            self.input_num += 1
166            self.input_bytes += before["bytes"]
167            self.input_chars += before["chars"]
168            self.output_num += 1
169            self.output_bytes += after["bytes"]
170            self.output_chars += after["chars"]
171            self.diff_bytes += after["bytes"] - before["bytes"]
172            self.diff_chars += after["chars"] - before["chars"]
173            self.cumulative_time_ns += after["time_ns"] - before["time_ns"]

Update the statistics by calculating the differences between two Doc-info mappings. This method is used to update the statistics after a filter is applied.

175    @staticmethod
176    def add(x: "Statistics", y: "Statistics") -> "Statistics":
177        """
178        Add two Statistics objects together.
179        This method assumes that the names of the two Statistics objects match.
180        If they do not match, it will raise an AssertionError."""
181        assert x.name == y.name, "Layer names must match"
182        return Statistics(
183            name=x.name,
184            input_num=x.input_num + y.input_num,
185            input_bytes=x.input_bytes + y.input_bytes,
186            input_chars=x.input_chars + y.input_chars,
187            output_num=x.output_num + y.output_num,
188            output_bytes=x.output_bytes + y.output_bytes,
189            output_chars=x.output_chars + y.output_chars,
190            discard_num=x.discard_num + y.discard_num,
191            diff_bytes=x.diff_bytes + y.diff_bytes,
192            diff_chars=x.diff_chars + y.diff_chars,
193            cumulative_time_ns=x.cumulative_time_ns + y.cumulative_time_ns,
194            errors=x.errors + y.errors,
195        )

Add two Statistics objects together. This method assumes that the names of the two Statistics objects match. If they do not match, it will raise an AssertionError.

@staticmethod
def add_list_of_stats( x: List[hojichar.core.models.Statistics], y: List[hojichar.core.models.Statistics]) -> List[hojichar.core.models.Statistics]:
197    @staticmethod
198    def add_list_of_stats(x: List["Statistics"], y: List["Statistics"]) -> List["Statistics"]:
199        """
200        Add FilterStatistics objects from two lists by matching their names.
201        This method assumes that both lists contain FilterStatistics objects
202        with the same names, and it will raise a ValueError if the sets of names
203        in the two lists do not match.
204        """
205        # check if the names in both lists match
206        names_x = {stat.name for stat in x}
207        names_y = {stat.name for stat in y}
208        if names_x != names_y:
209            raise ValueError(f"name の集合が一致しません: {names_x} vs {names_y}")
210
211        y_map = {stat.name: stat for stat in y}
212
213        # keep the order of x and add corresponding y
214        result: List[Statistics] = []
215        for stat_x in x:
216            stat_y = y_map[stat_x.name]
217            result.append(Statistics.add(stat_x, stat_y))
218
219        return result

Add Statistics objects from two lists by matching their names. This method assumes that both lists contain Statistics objects with the same names, and it will raise a ValueError if the sets of names in the two lists do not match.

@staticmethod
def get_filter( name: str, stats: List[hojichar.core.models.Statistics]) -> hojichar.core.models.Statistics:
221    @staticmethod
222    def get_filter(name: str, stats: List["Statistics"]) -> "Statistics":
223        """
224        Get a Statistics object by its name from a list of statistics.
225        If the name is not found, return None.
226        """
227        for stat in stats:
228            if stat.name == name:
229                return stat
230        raise KeyError(f"Statistics with name '{name}' not found in the list.")

Get a Statistics object by its name from a list of statistics. If the name is not found, a KeyError is raised.

def get_doc_info(document: hojichar.core.models.Document) -> dict[str, typing.Any]:
def get_doc_info(document: Document) -> Dict[str, Any]:
    """
    Create a document-info mapping from a Document instance.

    This function is used to extract metadata from the Document for
    statistics tracking.

    Returns:
        A dictionary with the keys:
          "is_rejected": the document's rejection flag,
          "bytes": length of the text encoded as UTF-8,
          "chars": length of the text in characters,
          "time_ns": a `time.perf_counter_ns()` reading.
    """
    rejected = document.is_rejected
    text = document.text
    return {
        "is_rejected": rejected,
        "bytes": len(text.encode("utf-8")),
        "chars": len(text),
        # Evaluated last, i.e. after the sizes have been computed.
        "time_ns": time.perf_counter_ns(),
    }

Create a document-info mapping from a Document instance. This function is used to extract metadata from the Document for statistics tracking.