Lingua-Identify-CLD2

 view release on metacpan or  search on metacpan

src/cld2/public/compact_lang_det.h  view on Meta::CPAN

  //
  // Inputs: text and text_length
  //  Code skips HTML tags and expands HTML entities, unless
  //  is_plain_text is true
  // Outputs:
  //  language3 is an array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3 is an array of the text percentages 0..100 of the top 3 languages
  //  text_bytes is the amount of non-tag/letters-only text found
  //  is_reliable set true if the returned Language is some amount more
  //   probable than the second-best Language. Calculation is a complex function
  //   of the length of the text and the different-script runs of text.
  // Return value: the most likely Language for the majority of the input text
  //  Length 0 input returns UNKNOWN_LANGUAGE. Very short indeterminate text
  //  defaults to ENGLISH.
  //
  // The first two versions return ENGLISH instead of UNKNOWN_LANGUAGE, for
  // backwards compatibility with a different detector.
  //
  // The third version may return UNKNOWN_LANGUAGE, and also returns extended
  // language codes from lang_script.h
  //


  // Instead of individual arguments, pass in hints as an initialized struct
  // Init to {NULL, NULL, UNKNOWN_ENCODING, UNKNOWN_LANGUAGE} if not known.
  //
  // Pass in hints whenever possible; doing so improves detection accuracy. The
  // passed-in hints are all information that is external to the text
  // itself.
  //
  // The content_language_hint is intended to come from an HTTP header
  // Content-Language: field, the tld_hint from the hostname of a URL, the
  // encoding_hint from an encoding detector applied to the input
  // document, and the language_hint from any other context you might have.
  // The lang= tags inside an HTML document will be picked up as hints
  // by code within the compact language detector.

  // Bundle of detection hints; every field is information external to the
  // text itself. Use NULL / UNKNOWN_ENCODING / UNKNOWN_LANGUAGE for any
  // field that is not known.
  typedef struct {
    // Intended to come from an HTTP header Content-Language: field.
    const char* content_language_hint;      // "mi,en" boosts Maori and English
    // Intended to come from the hostname of a URL.
    const char* tld_hint;                   // "id" boosts Indonesian
    // Intended to come from an encoding detector applied to the input document.
    int encoding_hint;                      // SJS boosts Japanese
    // From any other context the caller might have.
    Language language_hint;                 // ITALIAN boosts it
  } CLDHints;

  // 0x7fffffff is the largest positive value of a 32-bit signed integer.
  // NOTE(review): presumably the upper bound on ResultChunk::bytes -- confirm.
  static const int32 kMaxResultChunkBytes = 0x7fffffff;

  // One piece of the input buffer together with the language detected for it.
  // For returning a vector of per-language pieces of the input buffer.
  // Unreliable and too-short pieces are mapped to UNKNOWN_LANGUAGE.
  //
  // Note: this was initially over-optimized to fit into 8 bytes,
  // causing too much work to deal with byte lengths greater than 16 bits.
  typedef struct {
    int offset;                 // Starting byte offset in original buffer
    int32 bytes;                // Number of bytes in chunk
    uint16 lang1;               // Top lang, as full Language. Apply
                                //  static_cast<Language>() to this short value.
    uint16 pad;                 // Make multiple of 4 bytes
  } ResultChunk;
  // Vector of ResultChunk entries, for returning the per-language pieces of
  // the input buffer.
  typedef std::vector<ResultChunk> ResultChunkVector;


  // These initial simple versions all cascade through the full-blown last
  // version. Prefer calling that version directly: passing in any available
  // hints gives better results.

  // Scan interchange-valid UTF-8 bytes and detect most likely language
  // If the input is in fact not valid UTF-8, this returns immediately with
  // the result value UNKNOWN_LANGUAGE and is_reliable set to false.
  //
  // In all cases, valid_prefix_bytes will be set to the number of leading
  // bytes that are valid UTF-8. If this is < buffer_length, there is invalid
  // input starting at the following byte.
  //
  // Inputs:
  //  buffer, buffer_length: the text to scan and its length
  //  is_plain_text: HTML tags are skipped and HTML entities are expanded
  //   unless this is true
  // Outputs:
  //  is_reliable: set true if the returned Language is some amount more
  //   probable than the second-best Language; set false on invalid UTF-8
  //  valid_prefix_bytes: number of leading bytes that are valid UTF-8
  // Return value: the most likely Language for the majority of the input
  //  text, or UNKNOWN_LANGUAGE if the input is not valid UTF-8
  Language DetectLanguageCheckUTF8(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable,
                          int* valid_prefix_bytes);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect most likely language
  //
  // Inputs:
  //  buffer, buffer_length: the text to scan and its length
  //  is_plain_text: HTML tags are skipped and HTML entities are expanded
  //   unless this is true
  // Outputs:
  //  is_reliable: set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: the most likely Language for the majority of the input text
  Language DetectLanguage(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  // language3[0] is usually also the return value
  //
  // Inputs:
  //  buffer, buffer_length: the text to scan and its length
  //  is_plain_text: HTML tags are skipped and HTML entities are expanded
  //   unless this is true
  // Outputs:
  //  language3: array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3: array of the text percentages 0..100 of the top 3 languages
  //  text_bytes: the amount of non-tag/letters-only text found
  //  is_reliable: set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: the most likely Language for the majority of the input text
  Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Same as the hint-less DetectLanguageSummary, with hints supplied
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 languages.
  // language3[0] is usually also the return value
  //
  // Inputs:
  //  buffer, buffer_length: the text to scan and its length
  //  is_plain_text: HTML tags are skipped and HTML entities are expanded
  //   unless this is true
  //  tld_hint, encoding_hint, language_hint: information external to the
  //   text that boosts the corresponding languages (see per-argument notes)
  // Outputs:
  //  language3: array of the top 3 languages or UNKNOWN_LANGUAGE
  //  percent3: array of the text percentages 0..100 of the top 3 languages
  //  text_bytes: the amount of non-tag/letters-only text found
  //  is_reliable: set true if the returned Language is some amount more
  //   probable than the second-best Language
  // Return value: the most likely Language for the majority of the input text
  Language DetectLanguageSummary(
                          const char* buffer,
                          int buffer_length,
                          bool is_plain_text,
                          const char* tld_hint,       // "id" boosts Indonesian
                          int encoding_hint,          // SJS boosts Japanese
                          Language language_hint,     // ITALIAN boosts it
                          Language* language3,
                          int* percent3,
                          int* text_bytes,
                          bool* is_reliable);

  // Use this one ONLY if you can prove that the input text is valid UTF-8 by
  // design because it went through a known-good conversion program.
  // Scan interchange-valid UTF-8 bytes and detect list of top 3 extended
  // languages.



( run in 1.687 second using v1.01-cache-2.11-cpan-140bd7fdf52 )