Lingua-Identify-CLD2
view release on metacpan or search on metacpan
src/cld2/public/compact_lang_det.h view on Meta::CPAN
//
// Inputs: text and text_length
// Code skips HTML tags and expands HTML entities, unless
// is_plain_text is true
// Outputs:
// language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
// percent3 is an array of the text percentages 0..100 of the top 3 languages
// text_bytes is the amount of non-tag/letters-only text found
// is_reliable set true if the returned Language is some amount more
// probable than the second-best Language. Calculation is a complex function
// of the length of the text and the different-script runs of text.
// Return value: the most likely Language for the majority of the input text
// Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
// defaults to ENGLISH.
//
// The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
// backwards compatibility with a different detector.
//
// The third version may return UNKNOWN_LANGUAGE, and also returns extended
// language codes from lang_script.h
//
// Instead of individual arguments, pass in hints as an initialized struct
// Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
//
// Pass in hints whenever possible; doing so improves detection accuracy. The
// set of passed-in hints are all information that is external to the text
// itself.
//
// The content_language_hint is intended to come from an HTTP header
// Content-Language: field, the tld_hint from the hostname of a URL, the
// encoding-hint from an encoding detector applied to the input
// document, and the language hint from any other context you might have.
// The lang= tags inside an HTML document will be picked up as hints
// by code within the compact language detector.
// Bundle of detection hints, all of which are information external to the
// text itself. Initialize to
// {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} when nothing is known.
typedef struct {
const char* content_language_hint; // Intended to come from an HTTP
                                   // Content-Language: field;
                                   // e.g. "mi,en" boosts Maori and English
const char* tld_hint; // Intended to come from the hostname of a URL;
                      // e.g. "id" boosts Indonesian
int encoding_hint; // Intended to come from an encoding detector applied to
                   // the input document; e.g. SJS boosts Japanese
Language language_hint; // From any other context you might have;
                        // e.g. ITALIAN boosts it
} CLDHints;
// 0x7fffffff is 2^31 - 1, the largest positive value of a signed 32-bit
// integer.
// NOTE(review): presumably the upper bound on the byte count stored in a
// single ResultChunk -- confirm against the implementation.
static const int32 kMaxResultChunkBytes = 0x7fffffff;
// Note: this was initially over-optimized to fit into 8 bytes,
// causing too much work to deal with greater than 16-bit byte lengths.
// One per-language piece of the input buffer; pieces are returned to the
// caller as a vector of these.
// Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
typedef struct {
int offset; // Starting byte offset in original buffer
int32 bytes; // Number of bytes in chunk
uint16 lang1; // Top lang, as full Language, held in a 16-bit field. Apply
// static_cast<Language>() to this short value to get the Language back.
uint16 pad; // Padding to make the struct size a multiple of 4 bytes
} ResultChunk;
// Sequence of ResultChunk entries describing per-language pieces of the
// input buffer.
typedef std::vector<ResultChunk> ResultChunkVector;
// These initial simple versions all cascade through the full-blown last
// version, which it would be better for you to use directly, because you will
// get better results by passing in any available hints.
// Scan interchange-valid UTF-8 bytes and detect the most likely language.
// If the input is in fact not valid UTF-8, this returns immediately with
// the result value UNKNOWN_LANGUAGE and is_reliable set to false.
//
// In all cases, valid_prefix_bytes will be set to the number of leading
// bytes that are valid UTF-8. If this is < buffer_length, there is invalid
// input starting at the following byte.
//
// Parameters:
//   buffer             input text
//   buffer_length      length of the input text, in bytes
//   is_plain_text      when false, HTML tags are skipped and HTML entities
//                      are expanded; when true, the text is taken as-is
//   is_reliable        output: set true if the returned Language is some
//                      amount more probable than the second-best Language
//   valid_prefix_bytes output: number of leading bytes that are valid UTF-8
// Returns the most likely Language, or UNKNOWN_LANGUAGE for invalid UTF-8.
Language DetectLanguageCheckUTF8(
const char* buffer,
int buffer_length,
bool is_plain_text,
bool* is_reliable,
int* valid_prefix_bytes);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect the most likely language.
//
// Parameters:
//   buffer         input text
//   buffer_length  length of the input text
//                  (NOTE(review): presumably in bytes -- confirm)
//   is_plain_text  when false, HTML tags are skipped and HTML entities are
//                  expanded; when true, the text is taken as-is
//   is_reliable    output: set true if the returned Language is some amount
//                  more probable than the second-best Language
// Returns the most likely Language for the input text.
Language DetectLanguage(
const char* buffer,
int buffer_length,
bool is_plain_text,
bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value.
//
// Parameters:
//   buffer         input text
//   buffer_length  length of the input text
//                  (NOTE(review): presumably in bytes -- confirm)
//   is_plain_text  when false, HTML tags are skipped and HTML entities are
//                  expanded; when true, the text is taken as-is
//   language3      output: array of the top 3 languages or UNKNOWN_LANGUAGE
//   percent3       output: array of the text percentages 0..100 of the
//                  top 3 languages
//   text_bytes     output: amount of non-tag/letters-only text found
//   is_reliable    output: set true if the returned Language is some amount
//                  more probable than the second-best Language
// NOTE(review): language3 and percent3 presumably must point at
// caller-provided storage for 3 entries each -- confirm with the
// implementation.
Language DetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Same as the hint-less DetectLanguageSummary overload, with hints supplied.
// Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
// language3[0] is usually also the return value.
//
// Parameters:
//   buffer         input text
//   buffer_length  length of the input text
//                  (NOTE(review): presumably in bytes -- confirm)
//   is_plain_text  when false, HTML tags are skipped and HTML entities are
//                  expanded; when true, the text is taken as-is
//   tld_hint       hint intended to come from the hostname of a URL
//   encoding_hint  hint intended to come from an encoding detector applied
//                  to the input document
//   language_hint  hint from any other context you might have
//   language3      output: array of the top 3 languages or UNKNOWN_LANGUAGE
//   percent3       output: array of the text percentages 0..100 of the
//                  top 3 languages
//   text_bytes     output: amount of non-tag/letters-only text found
//   is_reliable    output: set true if the returned Language is some amount
//                  more probable than the second-best Language
Language DetectLanguageSummary(
const char* buffer,
int buffer_length,
bool is_plain_text,
const char* tld_hint, // "id" boosts Indonesian
int encoding_hint, // SJS boosts Japanese
Language language_hint, // ITALIAN boosts it
Language* language3,
int* percent3,
int* text_bytes,
bool* is_reliable);
// Use this one ONLY if you can prove that the input text is valid UTF-8 by
// design because it went through a known-good conversion program.
// Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
// languages.
( run in 1.687 second using v1.01-cache-2.11-cpan-140bd7fdf52 )