1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

######################## BEGIN LICENSE BLOCK ######################## 

# The Original Code is Mozilla Universal charset detector code. 

# 

# The Initial Developer of the Original Code is 

# Netscape Communications Corporation. 

# Portions created by the Initial Developer are Copyright (C) 2001 

# the Initial Developer. All Rights Reserved. 

# 

# Contributor(s): 

# Mark Pilgrim - port to Python 

# Shy Shalom - original C code 

# 

# This library is free software; you can redistribute it and/or 

# modify it under the terms of the GNU Lesser General Public 

# License as published by the Free Software Foundation; either 

# version 2.1 of the License, or (at your option) any later version. 

# 

# This library is distributed in the hope that it will be useful, 

# but WITHOUT ANY WARRANTY; without even the implied warranty of 

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 

# Lesser General Public License for more details. 

# 

# You should have received a copy of the GNU Lesser General Public 

# License along with this library; if not, write to the Free Software 

# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 

# 02110-1301 USA 

######################### END LICENSE BLOCK ######################### 

 

from .charsetprober import CharSetProber 

from .enums import CharacterCategory, ProbingState, SequenceLikelihood 

 

 

class SingleByteCharSetProber(CharSetProber):
    """Prober that scores input against one single-byte language model.

    Every byte is mapped to a frequency "order" through the model's
    ``char_to_order_map``.  Pairs of consecutive bytes whose orders both
    fall below ``SAMPLE_SIZE`` are looked up in the model's
    ``precedence_matrix`` and tallied per likelihood category; the share
    of ``SequenceLikelihood.POSITIVE`` pairs drives the confidence.

    The model is read as a mapping with the keys ``charset_name``,
    ``language`` (optional), ``keep_english_letter``,
    ``char_to_order_map``, ``precedence_matrix`` and
    ``typical_positive_ratio``.
    """

    # Only orders below this value take part in the pair statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs after which the shortcut thresholds apply.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    # Confidence above this value switches the prober to FOUND_IT.
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    # Confidence below this value switches the prober to NOT_ME.
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: language model mapping (see the class docstring
            for the keys that are read).
        :param reversed: when true, the two orders of every pair are
            swapped before the precedence-matrix lookup.
        :param name_prober: optional auxiliary prober; when given, its
            ``charset_name`` and ``language`` are reported instead of
            the model's own.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # The counters are declared here and given their real starting
        # values by reset().
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and clear all sequence statistics."""
        super(SingleByteCharSetProber, self).reset()
        # char order of last character; 255 is not below SAMPLE_SIZE, so
        # the first byte fed after a reset never completes a pair.
        self._last_order = 255
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the name prober if set, else from the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        else:
            return self._model['charset_name']

    @property
    def language(self):
        """Language from the name prober if set, else from the model.

        The model lookup uses ``.get``, so the result is ``None`` when
        the model has no ``language`` entry.
        """
        if self._name_prober:
            return self._name_prober.language
        else:
            return self._model.get('language')

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return the state.

        :param byte_str: chunk of input; every element is used as an
            index into the model's ``char_to_order_map``.
            NOTE(review): presumably a ``bytearray``/``bytes`` yielding
            ints - confirm against the caller.
        :returns: ``self.state`` after processing the chunk.
        """
        if not self._model['keep_english_letter']:
            # NOTE(review): helper inherited from the base prober;
            # presumably drops words without international characters -
            # confirm against CharSetProber.
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        # Loop invariants, bound once instead of once per byte.
        char_to_order_map = self._model['char_to_order_map']
        precedence_matrix = self._model['precedence_matrix']
        sample_size = self.SAMPLE_SIZE

        for char in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63
            #      here, but that leads to 27 more test failures than before.
            order = char_to_order_map[char]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < sample_size:
                self._freq_char += 1
                if self._last_order < sample_size:
                    self._total_seqs += 1
                    if not self._reversed:
                        index = (self._last_order * sample_size) + order
                    else:  # reverse the order of the letters in the lookup
                        index = (order * sample_size) + self._last_order
                    likelihood = precedence_matrix[index]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        charset_name = self._model['charset_name']
        if self.state == ProbingState.DETECTING:
            # Only take a shortcut decision once enough pairs were counted.
            if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
                confidence = self.get_confidence()
                if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, we have a winner',
                                      charset_name, confidence)
                    self._state = ProbingState.FOUND_IT
                elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                    self.logger.debug('%s confidence = %s, below negative '
                                      'shortcut threshhold %s', charset_name,
                                      confidence,
                                      self.NEGATIVE_SHORTCUT_THRESHOLD)
                    self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the current confidence for this model's charset.

        The value is ``0.01`` while no pair has been counted.  Otherwise
        it is the fraction of POSITIVE pairs, divided by the model's
        ``typical_positive_ratio`` and scaled by the share of counted
        characters that fell inside the sampling range; results of
        ``1.0`` or more are reported as ``0.99``.
        """
        confidence = 0.01
        if self._total_seqs > 0:
            positive_seqs = self._seq_counters[SequenceLikelihood.POSITIVE]
            confidence = ((1.0 * positive_seqs) / self._total_seqs /
                          self._model['typical_positive_ratio'])
            # A counted pair implies at least one counted character, so
            # _total_char is non-zero whenever this branch runs.
            confidence = confidence * self._freq_char / self._total_char
            if confidence >= 1.0:
                confidence = 0.99
        return confidence