hojichar.core.models
from typing import Any, Dict, List, Optional


class Token:
    """A unit of text that keeps its construction-time value alongside an editable one."""

    def __init__(self, text: str, is_rejected: bool = False) -> None:
        """Create a token.

        Args:
            text: Token text; stored both as the editable ``text`` attribute
                and as the read-only ``original`` value.
            is_rejected: Initial value of the ``is_rejected`` flag.
        """
        self.text = text
        # Name-mangled so later edits to ``text`` never touch the original.
        self.__original = text
        self.is_rejected = is_rejected

    @property
    def original(self) -> str:
        """Return the text the token was constructed with."""
        return self.__original

    def __str__(self) -> str:
        """Return the current (possibly modified) text."""
        return self.text


class Document:
    """A text document with optional tokens and free-form extra data."""

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List[Token]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Create a document.

        Args:
            text: Document text; stored both as the editable ``text``
                attribute and as the read-only ``original`` value.
            is_rejected: Initial value of the ``is_rejected`` flag.
            tokens: Tokens to attach. ``None`` yields a fresh empty list;
                a supplied list is stored as-is (not copied).
            extras: Extra data to attach. ``None`` yields a fresh empty
                dict; a supplied dict is stored as-is (not copied).
        """
        self.text = text
        # Name-mangled so later edits to ``text`` never touch the original.
        self.__original = text
        self.is_rejected = is_rejected

        # Always bind ``self.tokens``. Previously a caller-supplied list was
        # dropped and the attribute was left undefined, so ``get_tokens()``
        # raised AttributeError.
        self.tokens: List[Token] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras

        self.dedup_lsh: List[str] = []
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """Return the text the document was constructed with."""
        return self.__original

    def set_tokens(self, tokens: List[str]) -> None:
        """Replace ``self.tokens`` with new ``Token`` objects built from strings."""
        self.tokens = [Token(token) for token in tokens]

    def get_tokens(self) -> List[str]:
        """Return the current text of every token, in order."""
        return [token.text for token in self.tokens]

    def __str__(self) -> str:
        """Return the current (possibly modified) text."""
        return self.text

    def __repr__(self) -> str:
        """Return a debug representation showing text, rejection flag and extras."""
        return f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
class
Token:
class
Document:
class Document:
    """A text document with optional tokens and free-form extra data."""

    def __init__(
        self,
        text: str,
        is_rejected: bool = False,
        tokens: Optional[List[Token]] = None,
        extras: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Create a document.

        Args:
            text: Document text; stored both as the editable ``text``
                attribute and as the read-only ``original`` value.
            is_rejected: Initial value of the ``is_rejected`` flag.
            tokens: Tokens to attach. ``None`` yields a fresh empty list;
                a supplied list is stored as-is (not copied).
            extras: Extra data to attach. ``None`` yields a fresh empty
                dict; a supplied dict is stored as-is (not copied).
        """
        self.text = text
        # Name-mangled so later edits to ``text`` never touch the original.
        self.__original = text
        self.is_rejected = is_rejected

        # Always bind ``self.tokens``. Previously a caller-supplied list was
        # dropped and the attribute was left undefined, so ``get_tokens()``
        # raised AttributeError.
        self.tokens: List[Token] = [] if tokens is None else tokens
        self.extras: Dict[str, Any] = {} if extras is None else extras

        self.dedup_lsh: List[str] = []
        self.reject_reason: Dict[str, Any] = {}

    @property
    def original(self) -> str:
        """Return the text the document was constructed with."""
        return self.__original

    def set_tokens(self, tokens: List[str]) -> None:
        """Replace ``self.tokens`` with new ``Token`` objects built from strings."""
        self.tokens = [Token(token) for token in tokens]

    def get_tokens(self) -> List[str]:
        """Return the current text of every token, in order."""
        return [token.text for token in self.tokens]

    def __str__(self) -> str:
        """Return the current (possibly modified) text."""
        return self.text

    def __repr__(self) -> str:
        """Return a debug representation showing text, rejection flag and extras."""
        return f"Document(text={self.text!r}, is_rejected={self.is_rejected}, extras={self.extras})"  # noqa
Document( text: str, is_rejected: bool = False, tokens: Optional[List[hojichar.core.models.Token]] = None, extras: Optional[Dict[str, Any]] = None)
def __init__(
    self,
    text: str,
    is_rejected: bool = False,
    tokens: Optional[List[Token]] = None,
    extras: Optional[Dict[str, Any]] = None,
) -> None:
    """Create a document.

    Args:
        text: Document text; stored both as the editable ``text``
            attribute and as the private original value.
        is_rejected: Initial value of the ``is_rejected`` flag.
        tokens: Tokens to attach. ``None`` yields a fresh empty list;
            a supplied list is stored as-is (not copied).
        extras: Extra data to attach. ``None`` yields a fresh empty
            dict; a supplied dict is stored as-is (not copied).
    """
    self.text = text
    # Name-mangled so later edits to ``text`` never touch the original.
    self.__original = text
    self.is_rejected = is_rejected

    # Always bind ``self.tokens``. Previously a caller-supplied list was
    # dropped and the attribute was left undefined.
    self.tokens: List[Token] = [] if tokens is None else tokens
    self.extras: Dict[str, Any] = {} if extras is None else extras

    self.dedup_lsh: List[str] = []
    self.reject_reason: Dict[str, Any] = {}