Fix ISO-2022-JP encoding detection. by danhper · Pull Request #10 · janx/chardet2 · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Fix ISO-2022-JP encoding detection. #10

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions lib/CodingStateMachine.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
Expand All @@ -30,8 +30,11 @@

module UniversalDetector
class CodingStateMachine
attr_accessor :active

def initialize(sm)
@_mModel = sm
@active = false
@_mCurrentBytePos = 0
@_mCurrentCharLen = 0
reset()
Expand All @@ -45,7 +48,7 @@ def next_state(c)
# for each byte we get its class
# if it is first byte, we also get byte length
byteCls = @_mModel['classTable'][c]

if @_mCurrentState == :Start
@_mCurrentBytePos = 0
@_mCurrentCharLen = @_mModel['charLenTable'][byteCls]
Expand All @@ -58,7 +61,7 @@ def next_state(c)
v = stateValue[@_mCurrentState]
end
@_mCurrentState = @_mModel['stateTable'][v * @_mModel['classFactor'] + byteCls]

@_mCurrentBytePos += 1
return @_mCurrentState
end
Expand All @@ -71,4 +74,4 @@ def get_coding_state_machine
return @_mModel['name']
end
end
end
end
20 changes: 10 additions & 10 deletions lib/EscCharSetProber.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
Expand All @@ -36,18 +36,18 @@ class EscCharSetProber < CharSetProber
def initialize
super
@_mCodingSM = [ \
CodingStateMachine(HZSMModel),
CodingStateMachine(ISO2022CNSMModel),
CodingStateMachine(ISO2022JPSMModel),
CodingStateMachine(ISO2022KRSMModel)
CodingStateMachine.new(HZSMModel),
CodingStateMachine.new(ISO2022CNSMModel),
CodingStateMachine.new(ISO2022JPSMModel),
CodingStateMachine.new(ISO2022KRSMModel)
]
reset()
end

def reset
super
for codingSM in @_mCodingSM
unless codingSM then continue end
next if codingSM.nil?
codingSM.active = true
codingSM.reset()
end
Expand All @@ -68,10 +68,10 @@ def get_confidence
end

def feed(aBuf)
for c in aBuf
aBuf.each_byte do |c|
for codingSM in @_mCodingSM
unless codingSM then continue end
unless codingSM.active then continue end
next if codingSM.nil?
next unless codingSM.active
codingState = codingSM.next_state(c)
if codingState == :Error
codingSM.active = false
Expand Down
58 changes: 29 additions & 29 deletions lib/UniversalDetector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
Expand Down Expand Up @@ -48,38 +48,38 @@ def chardet(data)
u.result
end
end

# Debug switch: close only prints its per-prober diagnostics when this is truthy.
DEBUG = nil

# Integer codes named after prober results.
# NOTE(review): "Detectiong" looks like a misspelling of "Detecting", and the
# visible feed code compares prober results against the symbol :FoundIt rather
# than these integers -- confirm whether they are still referenced before renaming.
Detectiong = 0
FoundIt = 1
NotMe = 2

# Integer codes named after coding-state-machine states.
# NOTE(review): the visible state-machine code compares against the symbols
# :Start and :Error instead -- confirm whether these integers are still used.
Start = 0
Error = 1
ItsMe = 2

# presumably the lowest prober confidence that close will accept -- TODO confirm
MINIMUM_THRESHOLD = 0.20
# Integer codes named after input states; feed and close track @_mInputState
# with the symbols :PureAscii, :EscAscii and :Highbyte.
PureAscii = 0
EscAscii = 1
Highbyte = 2

# NOTE(review): not referenced in the visible code; presumably a confidence
# above which detection can stop early -- confirm against the probers.
SHORTCUT_THRESHOLD = 0.95

class Detector

include Singleton

attr_reader :result

# Builds the byte-pattern regexes and empty prober slots, then calls reset
# to initialise the per-run detection state.
def initialize
# Binary (/n) regex matching any byte in the range 0x80-0xFF.
@_highBitDetector = /[\x80-\xFF]/n
# Binary (/n) regex matching an ESC byte (octal 033) or the two characters "~{".
@_escDetector = /\033|~\{/n
# Left nil here; feed creates the EscCharSetProber the first time the input
# state is :EscAscii.
@_mEscCharSetProber = nil
# Left empty here; feed fills it the first time the input state is :Highbyte.
@_mCharSetProbers = []
reset
end

def reset
@result = {"encoding"=> nil, "confidence"=> 0.0}
@done = false
Expand All @@ -93,11 +93,11 @@ def reset
for prober in @_mCharSetProbers
prober.reset
end
end
end

def feed(data)
if @done || data.empty?
return
return
end
unless @_mGotData
# If the data starts with BOM, we know it is UTF
Expand All @@ -107,7 +107,7 @@ def feed(data)
elsif data[0,4] == "\xFF\xFE\x00\x00"
# FF FE 00 00 UTF-32, little-endian BOM
@result = {"encoding"=> "UTF-32LE", "confidence"=> 1.0}
elsif data[0,4] == "\x00\x00\xFE\xFF"
elsif data[0,4] == "\x00\x00\xFE\xFF"
# 00 00 FE FF UTF-32, big-endian BOM
@result = {"encoding"=> "UTF-32BE", "confidence"=> 1.0}
elsif data[0,4] == "\xFE\xFF\x00\x00"
Expand All @@ -121,37 +121,37 @@ def feed(data)
@result = {"encoding"=> "UTF-16LE", "confidence"=> 1.0}
elsif data[0,2] == "\xFE\xFF"
# FE FF UTF-16, big endian BOM
@result = {"encoding"=> "UTF-16BE", "confidence"=> 1.0}
@result = {"encoding"=> "UTF-16BE", "confidence"=> 1.0}
end
end
@_mGotData = true
if @result["encoding"] && @result["confidence"] > 0.0
@done = true
return
end
end

if @_mInputState == :PureAscii
if data =~ @_highBitDetector
@_mInputState = :Highbyte
elsif (@_mLastChar + data) =~ @_escDetector
@_mInputState = :EscAscii
end
end
end

@_mLastChar = data[-1]
if @_mInputState == :EscAscii
unless @_mEscCharSetProber
@_mEscCharSetProber = EscCharSetProber.new
end
if @_mEscCharSetProber.feed(data) == constants.eFoundIt
if @_mEscCharSetProber.feed(data) == :FoundIt
@result = {"encoding"=> @_mEscCharSetProber.get_charset_name() ,"confidence"=> @_mEscCharSetProber.get_confidence()}
@done = true
end
@done = true
end
elsif @_mInputState == :Highbyte
if @_mCharSetProbers.empty?
@_mCharSetProbers = MBCSGroupProber.new.mProbers + SBCSGroupProber.new.mProbers + [Latin1Prober.new]
end
@_mCharSetProbers.each do |prober|
end
@_mCharSetProbers.each do |prober|
if prober.feed(data) == :FoundIt
@result = {"encoding"=> prober.get_charset_name(), "confidence"=> prober.get_confidence()}
@done = true
Expand All @@ -160,7 +160,7 @@ def feed(data)
end #for
end
end #feed

def close
if @done then return end
unless @_mGotData
Expand All @@ -170,7 +170,7 @@ def close
return
end
@done = true

if @_mInputState == :PureAscii
@result = {"encoding" => "ascii", "confidence" => 1.0}
return @result
Expand All @@ -194,7 +194,7 @@ def close
return @result
end
end #if

if DEBUG
p("no probers hit minimum threshhold\n")
for prober in @_mCharSetProbers
Expand All @@ -203,8 +203,8 @@ def close
[prober.get_charset_name(), \
prober.get_confidence()])
end
end
end
end #close
end #class

end #module
0