Encode-Detect
view release on metacpan or search on metacpan
src/nsUniversalDetector.cpp view on Meta::CPAN
// Threshold constant: the literal 0.20 cast to float.
// NOTE(review): the code that references this macro is not visible in this
// part of the file; presumably it is the minimum confidence a candidate
// charset must reach to be reported -- confirm against the use site.
#define MINIMUM_THRESHOLD (float)0.20
nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
if(mDone)
return NS_OK;
if (aLen > 0)
mGotData = PR_TRUE;
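// If the data starts with a byte order mark (BOM), the encoding is known
// immediately: UTF-8, UTF-16, UTF-32, or one of the unusual-order UCS-4 forms.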
if (mStart)
{
mStart = PR_FALSE;
if (aLen > 3)
switch (aBuf[0])
{
case '\xEF':
if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
// EF BB BF UTF-8 encoded BOM
mDetectedCharset = "UTF-8";
break;
case '\xFE':
if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
mDetectedCharset = "UTF-16BE";
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = "UTF-32BE";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
break;
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = "UTF-32LE";
else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
mDetectedCharset = "UTF-16LE";
break;
} // switch
if (mDetectedCharset)
{
mDone = PR_TRUE;
return NS_OK;
}
}
( run in 0.485 second using v1.01-cache-2.11-cpan-e9daa2b36ef )