1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

######################## BEGIN LICENSE BLOCK ######################## 

# The Original Code is mozilla.org code. 

# 

# The Initial Developer of the Original Code is 

# Netscape Communications Corporation. 

# Portions created by the Initial Developer are Copyright (C) 1998 

# the Initial Developer. All Rights Reserved. 

# 

# Contributor(s): 

# Mark Pilgrim - port to Python 

# 

# This library is free software; you can redistribute it and/or 

# modify it under the terms of the GNU Lesser General Public 

# License as published by the Free Software Foundation; either 

# version 2.1 of the License, or (at your option) any later version. 

# 

# This library is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

# Lesser General Public License for more details. 

# 

# You should have received a copy of the GNU Lesser General Public 

# License along with this library; if not, write to the Free Software 

# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

# 02110-1301 USA 

######################### END LICENSE BLOCK ######################### 

 

from .charsetprober import CharSetProber 

from .codingstatemachine import CodingStateMachine 

from .enums import LanguageFilter, ProbingState, MachineState 

from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL, 

ISO2022KR_SM_MODEL) 

 

 

class EscCharSetProber(CharSetProber):
    """
    Prober built on the "code scheme" approach to encoding detection.

    Each candidate encoding is tracked by its own coding state machine; the
    encodings are identified by the easily recognizable escape or shift
    sequences they contain, and the first machine to report a match decides
    the detected charset.
    """

    def __init__(self, lang_filter=None):
        """
        Build the coding state machines selected by ``lang_filter``.

        :param lang_filter: value forwarded to the base-class constructor;
            ``self.lang_filter`` is then bitwise-AND-ed with
            ``LanguageFilter`` flags to decide which models are loaded.

        NOTE(review): the default is ``None`` and ``self.lang_filter`` is
        combined with ``&`` below. Unless the base class substitutes a flag
        value for ``None``, constructing the prober without an argument
        would raise ``TypeError`` -- confirm against ``CharSetProber``.
        """
        super(EscCharSetProber, self).__init__(lang_filter=lang_filter)
        self.coding_sm = []

        # Language flag -> state machine models enabled by that flag, in the
        # order the machines are appended to ``self.coding_sm``.
        models_by_flag = (
            (LanguageFilter.CHINESE_SIMPLIFIED,
             (HZ_SM_MODEL, ISO2022CN_SM_MODEL)),
            (LanguageFilter.JAPANESE, (ISO2022JP_SM_MODEL,)),
            (LanguageFilter.KOREAN, (ISO2022KR_SM_MODEL,)),
        )
        for flag, models in models_by_flag:
            if self.lang_filter & flag:
                self.coding_sm.extend(
                    CodingStateMachine(model) for model in models
                )

        # Declare every attribute up front; reset() assigns the working
        # values of the counter and the detection results.
        self.active_sm_count = None
        self._detected_charset = None
        self._detected_language = None
        self._state = None
        self.reset()

    def reset(self):
        """
        Return the prober to its initial state.

        Runs the base-class reset, re-activates and resets every state
        machine, counts all machines as active again, and clears any
        previously detected charset and language.
        """
        super(EscCharSetProber, self).reset()
        for machine in self.coding_sm:
            if machine:
                machine.active = True
                machine.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self):
        """Charset recorded by ``feed``, or ``None`` if none was found."""
        return self._detected_charset

    @property
    def language(self):
        """Language recorded by ``feed``, or ``None`` if none was found."""
        return self._detected_language

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        return 0.99 if self._detected_charset else 0.00

    def feed(self, byte_str):
        """
        Push every item of ``byte_str`` through the active state machines.

        A machine that reports ``MachineState.ERROR`` is deactivated; when no
        active machine is left the prober state becomes
        ``ProbingState.NOT_ME``. A machine that reports
        ``MachineState.ITS_ME`` sets the state to ``ProbingState.FOUND_IT``
        and supplies the detected charset and language. Processing stops at
        the first of these two outcomes.

        :param byte_str: iterable whose items are handed one at a time to
            each machine's ``next_state``.
        :returns: the value of ``self.state`` after processing.
        """
        for item in byte_str:
            for machine in self.coding_sm:
                # Skip machines that are falsy or were already ruled out.
                if not (machine and machine.active):
                    continue

                outcome = machine.next_state(item)

                if outcome == MachineState.ERROR:
                    # This encoding is impossible for the input seen so far.
                    machine.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count > 0:
                        continue
                    # Every candidate has been eliminated.
                    self._state = ProbingState.NOT_ME
                    return self.state

                if outcome == MachineState.ITS_ME:
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = (
                        machine.get_coding_state_machine()
                    )
                    self._detected_language = machine.language
                    return self.state

        return self.state