Encode-Detect
view release on metacpan or search on metacpan
src/nsUniversalDetector.cpp view on Meta::CPAN
{
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
mEscCharSetProber = nsnull;
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
PRUint32 i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
mCharSetProbers[i] = nsnull;
}
nsUniversalDetector::~nsUniversalDetector()
{
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
if (mCharSetProbers[i])
delete mCharSetProbers[i];
if (mEscCharSetProber)
delete mEscCharSetProber;
}
void
nsUniversalDetector::Reset()
{
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
mInTag = PR_FALSE;
mStart = PR_TRUE;
mDetectedCharset = nsnull;
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
if (mEscCharSetProber)
mEscCharSetProber->Reset();
PRUint32 i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
if (mCharSetProbers[i])
mCharSetProbers[i]->Reset();
}
//---------------------------------------------------------------------
#define SHORTCUT_THRESHOLD (float)0.95
#define MINIMUM_THRESHOLD (float)0.20
nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
if(mDone)
return NS_OK;
if (aLen > 0)
mGotData = PR_TRUE;
//If the data starts with BOM, we know it is UTF
if (mStart)
{
mStart = PR_FALSE;
if (aLen > 3)
switch (aBuf[0])
{
case '\xEF':
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
// EF BB BF UTF-8 encoded BOM
mDetectedCharset = "UTF-8";
break;
case '\xFE':
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
mDetectedCharset = "UTF-16BE";
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = "UTF-32BE";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
break;
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = "UTF-32LE";
else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
mDetectedCharset = "UTF-16LE";
break;
} // switch
if (mDetectedCharset)
{
mDone = PR_TRUE;
return NS_OK;
}
}
PRUint32 i;
for (i = 0; i < aLen; i++)
{
//other than 0xa0, if every othe character is ascii, the page is ascii
if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //Since many Ascii only page contains NBSP
{
//we got a non-ascii byte (high-byte)
if (mInputState != eHighbyte)
{
//adjust state
mInputState = eHighbyte;
//kill mEscCharSetProber if it is active
if (mEscCharSetProber) {
delete mEscCharSetProber;
mEscCharSetProber = nsnull;
}
//start multibyte and singlebyte charset prober
if (nsnull == mCharSetProbers[0])
mCharSetProbers[0] = new nsMBCSGroupProber;
if (nsnull == mCharSetProbers[1])
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[2])
mCharSetProbers[2] = new nsLatin1Prober;
if ((nsnull == mCharSetProbers[0]) ||
(nsnull == mCharSetProbers[1]) ||
(nsnull == mCharSetProbers[2]))
return NS_ERROR_OUT_OF_MEMORY;
}
}
else
{
//ok, just pure ascii so far
if ( ePureAscii == mInputState &&
(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
{
//found escape character or HZ "~{"
mInputState = eEscAscii;
}
mLastChar = aBuf[i];
}
}
nsProbingState st;
switch (mInputState)
{
case eEscAscii:
( run in 0.588 second using v1.01-cache-2.11-cpan-13bb782fe5a )