diff --git a/lib/CodingStateMachine.rb b/lib/CodingStateMachine.rb index 531b03c..047764d 100644 --- a/lib/CodingStateMachine.rb +++ b/lib/CodingStateMachine.rb @@ -14,12 +14,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -30,8 +30,11 @@ module UniversalDetector class CodingStateMachine + attr_accessor :active + def initialize(sm) @_mModel = sm + @active = false @_mCurrentBytePos = 0 @_mCurrentCharLen = 0 reset() @@ -45,7 +48,7 @@ def next_state(c) # for each byte we get its class # if it is first byte, we also get byte length byteCls = @_mModel['classTable'][c] - + if @_mCurrentState == :Start @_mCurrentBytePos = 0 @_mCurrentCharLen = @_mModel['charLenTable'][byteCls] @@ -58,7 +61,7 @@ def next_state(c) v = stateValue[@_mCurrentState] end @_mCurrentState = @_mModel['stateTable'][v * @_mModel['classFactor'] + byteCls] - + @_mCurrentBytePos += 1 return @_mCurrentState end @@ -71,4 +74,4 @@ def get_coding_state_machine return @_mModel['name'] end end -end \ No newline at end of file +end diff --git a/lib/EscCharSetProber.rb b/lib/EscCharSetProber.rb index 8b50703..48ca9c3 100644 --- a/lib/EscCharSetProber.rb +++ b/lib/EscCharSetProber.rb @@ -14,12 +14,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -36,10 +36,10 @@ class EscCharSetProber < CharSetProber def initialize super @_mCodingSM = [ \ - CodingStateMachine(HZSMModel), - CodingStateMachine(ISO2022CNSMModel), - CodingStateMachine(ISO2022JPSMModel), - CodingStateMachine(ISO2022KRSMModel) + CodingStateMachine.new(HZSMModel), + CodingStateMachine.new(ISO2022CNSMModel), + CodingStateMachine.new(ISO2022JPSMModel), + CodingStateMachine.new(ISO2022KRSMModel) ] reset() end @@ -47,7 +47,7 @@ def initialize def reset super for codingSM in @_mCodingSM - unless codingSM then continue end + next if codingSM.nil? codingSM.active = true codingSM.reset() end @@ -68,10 +68,10 @@ def get_confidence end def feed(aBuf) - for c in aBuf + aBuf.each_byte do |c| for codingSM in @_mCodingSM - unless codingSM then continue end - unless codingSM.active then continue end + next if codingSM.nil? + next unless codingSM.active codingState = codingSM.next_state(c) if codingState == :Error codingSM.active = false diff --git a/lib/UniversalDetector.rb b/lib/UniversalDetector.rb index 34f81af..b6613c1 100644 --- a/lib/UniversalDetector.rb +++ b/lib/UniversalDetector.rb @@ -14,12 +14,12 @@ # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. -# +# # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. -# +# # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA @@ -48,9 +48,9 @@ def chardet(data) u.result end end - + DEBUG = nil - + Detectiong = 0 FoundIt = 1 NotMe = 2 @@ -58,20 +58,20 @@ def chardet(data) Start = 0 Error = 1 ItsMe = 2 - + MINIMUM_THRESHOLD = 0.20 PureAscii = 0 EscAscii = 1 Highbyte = 2 SHORTCUT_THRESHOLD = 0.95 - + class Detector - + include Singleton - + attr_reader :result - + def initialize @_highBitDetector = /[\x80-\xFF]/n @_escDetector = /\033|~\{/n @@ -79,7 +79,7 @@ def initialize @_mCharSetProbers = [] reset end - + def reset @result = {"encoding"=> nil, "confidence"=> 0.0} @done = false @@ -93,11 +93,11 @@ def reset for prober in @_mCharSetProbers prober.reset end - end - + end + def feed(data) if @done || data.empty? - return + return end unless @_mGotData # If the data starts with BOM, we know it is UTF @@ -107,7 +107,7 @@ def feed(data) elsif data[0,4] == "\xFF\xFE\x00\x00" # FF FE 00 00 UTF-32, little-endian BOM @result = {"encoding"=> "UTF-32LE", "confidence"=> 1.0} - elsif data[0,4] == "\x00\x00\xFE\xFF" + elsif data[0,4] == "\x00\x00\xFE\xFF" # 00 00 FE FF UTF-32, big-endian BOM @result = {"encoding"=> "UTF-32BE", "confidence"=> 1.0} elsif data[0,4] == "\xFE\xFF\x00\x00" @@ -121,37 +121,37 @@ def feed(data) @result = {"encoding"=> "UTF-16LE", "confidence"=> 1.0} elsif data[0,2] == "\xFE\xFF" # FE FF UTF-16, big endian BOM - @result = {"encoding"=> "UTF-16BE", "confidence"=> 1.0} + @result = {"encoding"=> "UTF-16BE", "confidence"=> 1.0} end end @_mGotData = true if @result["encoding"] && @result["confidence"] > 0.0 @done = true return - end - + end + if @_mInputState == :PureAscii if data =~ @_highBitDetector @_mInputState = :Highbyte elsif (@_mLastChar + data) =~ @_escDetector @_mInputState = :EscAscii end - end - + end + @_mLastChar = data[-1] if @_mInputState == :EscAscii unless @_mEscCharSetProber @_mEscCharSetProber = EscCharSetProber.new end - if @_mEscCharSetProber.feed(data) == constants.eFoundIt + if @_mEscCharSetProber.feed(data) == :FoundIt @result = {"encoding"=> @_mEscCharSetProber.get_charset_name() ,"confidence"=> @_mEscCharSetProber.get_confidence()} - @done = true - end + @done = true + end elsif @_mInputState == :Highbyte if @_mCharSetProbers.empty? @_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new] - end - @_mCharSetProbers.each do |prober| + end + @_mCharSetProbers.each do |prober| if prober.feed(data) == :FoundIt @result = {"encoding"=> prober.get_charset_name(), "confidence"=> prober.get_confidence()} @done = true @@ -160,7 +160,7 @@ def feed(data) end #for end end #feed - + def close if @done then return end unless @_mGotData @@ -170,7 +170,7 @@ def close return end @done = true - + if @_mInputState == :PureAscii @result = {"encoding" => "ascii", "confidence" => 1.0} return @result @@ -194,7 +194,7 @@ def close return @result end end #if - + if DEBUG p("no probers hit minimum threshhold\n") for prober in @_mCharSetProbers @@ -203,8 +203,8 @@ def close [prober.get_charset_name(), \ prober.get_confidence()]) end - end + end end #close end #class - + end #module