hojichar.core.models

 1from typing import Any, Dict, List, Optional
 2
 3
 4class Token:
 5    def __init__(self, text: str, is_rejected: bool = False) -> None:
 6        self.text = text
 7        self.__original = text
 8        self.is_rejected = is_rejected
 9
10    @property
11    def original(self) -> str:
12        return self.__original
13
14    def __str__(self) -> str:
15        return self.text
16
17
class Document:
    """A piece of text plus the bookkeeping attributes attached to it.

    Attributes:
        text: The current text; may be reassigned by callers.
        original: Read-only copy of the text passed to the constructor.
        is_rejected: Flag stored as given.
        tokens: List of ``Token`` objects; empty unless supplied or set
            through ``set_tokens``.
        extras: Dictionary supplied by the caller, or a new empty one.
        dedup_lsh: List of strings; starts empty.
        reject_reason: Dictionary; starts empty.
    """

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List["Token"]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Create a document.

        Args:
            text: Initial text; also kept as ``original``.
            is_rejected: Initial value of the ``is_rejected`` flag.
            tokens: Existing token list to attach. ``None`` means "start
                with an empty list". The list is stored as-is, not copied.
            extras: Existing dictionary to attach. ``None`` means "start
                with an empty dict". The dict is stored as-is, not copied.
        """
        self.text = text
        self.__original = text
        self.is_rejected = is_rejected
        # Always bind ``tokens``: leaving the attribute unset when a list is
        # supplied would make ``get_tokens`` fail with AttributeError.
        self.tokens: List["Token"] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras

        self.dedup_lsh: List[str] = []
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """The text supplied at construction time (read-only)."""
        return self.__original

    def set_tokens(self, tokens: List[str]) -> None:
        """Replace ``self.tokens`` with one ``Token`` per string in ``tokens``."""
        self.tokens = [Token(token) for token in tokens]

    def get_tokens(self) -> List[str]:
        """Return the ``text`` of every token in ``self.tokens``, in order."""
        return [token.text for token in self.tokens]

    def __str__(self) -> str:
        """Return the current value of ``text``."""
        return self.text

    def __repr__(self) -> str:
        """Return a debug representation showing text, flag and extras."""
        return f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
class Token:
 5class Token:
 6    def __init__(self, text: str, is_rejected: bool = False) -> None:
 7        self.text = text
 8        self.__original = text
 9        self.is_rejected = is_rejected
10
11    @property
12    def original(self) -> str:
13        return self.__original
14
15    def __str__(self) -> str:
16        return self.text
Token(text: str, is_rejected: bool = False)
6    def __init__(self, text: str, is_rejected: bool = False) -> None:
7        self.text = text
8        self.__original = text
9        self.is_rejected = is_rejected
class Document:
class Document:
    """A piece of text plus the bookkeeping attributes attached to it.

    Attributes:
        text: The current text; may be reassigned by callers.
        original: Read-only copy of the text passed to the constructor.
        is_rejected: Flag stored as given.
        tokens: List of ``Token`` objects; empty unless supplied or set
            through ``set_tokens``.
        extras: Dictionary supplied by the caller, or a new empty one.
        dedup_lsh: List of strings; starts empty.
        reject_reason: Dictionary; starts empty.
    """

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List["Token"]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Create a document.

        Args:
            text: Initial text; also kept as ``original``.
            is_rejected: Initial value of the ``is_rejected`` flag.
            tokens: Existing token list to attach. ``None`` means "start
                with an empty list". The list is stored as-is, not copied.
            extras: Existing dictionary to attach. ``None`` means "start
                with an empty dict". The dict is stored as-is, not copied.
        """
        self.text = text
        self.__original = text
        self.is_rejected = is_rejected
        # Always bind ``tokens``: leaving the attribute unset when a list is
        # supplied would make ``get_tokens`` fail with AttributeError.
        self.tokens: List["Token"] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras

        self.dedup_lsh: List[str] = []
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """The text supplied at construction time (read-only)."""
        return self.__original

    def set_tokens(self, tokens: List[str]) -> None:
        """Replace ``self.tokens`` with one ``Token`` per string in ``tokens``."""
        self.tokens = [Token(token) for token in tokens]

    def get_tokens(self) -> List[str]:
        """Return the ``text`` of every token in ``self.tokens``, in order."""
        return [token.text for token in self.tokens]

    def __str__(self) -> str:
        """Return the current value of ``text``."""
        return self.text

    def __repr__(self) -> str:
        """Return a debug representation showing text, flag and extras."""
        return f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
Document( text: str, is_rejected: bool = False, tokens: Optional[List[hojichar.core.models.Token]] = None, extras: Optional[Dict[str, Any]] = None)
20    def __init__(
21        self,
22        text: str,
23        is_rejected: bool = False,
24        tokens: Optional[List[Token]] = None,
25        extras: Optional[Dict[str, Any]] = None,
26    ) -> None:
27        self.text = text
28        self.__original = text
29        self.is_rejected = is_rejected
30        if tokens is None:
31            self.tokens: List[Token] = []
32
33        if extras is None:
34            self.extras: Dict[str, Any] = {}
35        else:
36            self.extras = extras
37
38        self.dedup_lsh: List[str] = []
39        self.reject_reason: Dict[str, Any] = {}
def set_tokens(self, tokens: List[str]) -> None:
45    def set_tokens(self, tokens: List[str]) -> None:
46        self.tokens = [Token(token) for token in tokens]
def get_tokens(self) -> List[str]:
48    def get_tokens(self) -> List[str]:
49        return [token.text for token in self.tokens]