Alien-SVN

 view release on metacpan or  search on metacpan

src/subversion/subversion/libsvn_subr/utf_validate.c  view on Meta::CPAN

/*
 * utf_validate.c:  Validate a UTF-8 string
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

/* Validate a UTF-8 string according to the rules in
 *
 *    Table 3-6. Well-Formed UTF-8 Byte Sequences
 *
 * in
 *
 *    The Unicode Standard, Version 4.0
 *
 * which is available at
 *
 *    http://www.unicode.org/
 *
 * UTF-8 was originally defined in RFC-2279; Unicode's "well-formed UTF-8"
 * is a subset of that encoding.  The Unicode encoding prohibits things
 * like non-shortest encodings (some characters can be represented by more
 * than one multi-byte encoding) and the encodings for the surrogate code
 * points.  RFC-3629 supersedes RFC-2279 and adopts the same well-formed
 * rules as Unicode.  This is the ABNF in RFC-3629 that describes
 * well-formed UTF-8 rules:
 *
 *   UTF8-octets = *( UTF8-char )
 *   UTF8-char   = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
 *   UTF8-1      = %x00-7F
 *   UTF8-2      = %xC2-DF UTF8-tail
 *   UTF8-3      = %xE0 %xA0-BF UTF8-tail /
 *                 %xE1-EC 2( UTF8-tail ) /
 *                 %xED %x80-9F UTF8-tail /
 *                 %xEE-EF 2( UTF8-tail )
 *   UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) /
 *                 %xF1-F3 3( UTF8-tail ) /
 *                 %xF4 %x80-8F 2( UTF8-tail )
 *   UTF8-tail   = %x80-BF
 *
 */

#include "private/svn_utf_private.h"
#include "private/svn_eol_private.h"
#include "private/svn_dep_compat.h"

/* Lookup table to categorise each octet in the string.
 *
 * Indexed by the octet value (0x00-0xff); each entry is a small category
 * number.  Octets sharing a category occupy one contiguous range:
 *
 *    category   octets
 *    --------   ---------
 *        0      0x00-0x7f
 *        1      0x80-0x8f
 *        2      0x90-0x9f
 *        3      0xa0-0xbf
 *        4      0xc0-0xc1
 *        5      0xc2-0xdf
 *        6      0xe0
 *        7      0xe1-0xec
 *        8      0xed
 *        9      0xee-0xef
 *       10      0xf0
 *       11      0xf1-0xf3
 *       12      0xf4
 *       13      0xf5-0xff
 *
 * Categories 4 and 13 cover the octets that appear in no rule of the
 * RFC-3629 ABNF quoted at the top of this file.
 *
 * Each row below holds sixteen consecutive octets.
 */
static const char octet_category[256] = {
  /* 0x00-0x0f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x10-0x1f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x20-0x2f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x30-0x3f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x40-0x4f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x50-0x5f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x60-0x6f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x70-0x7f */  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  /* 0x80-0x8f */  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
  /* 0x90-0x9f */  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
  /* 0xa0-0xaf */  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
  /* 0xb0-0xbf */  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
  /* 0xc0-0xcf */  4,  4,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
  /* 0xd0-0xdf */  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
  /* 0xe0-0xef */  6,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  8,  9,  9,
  /* 0xf0-0xff */ 10, 11, 11, 11, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
};

/* Machine states
 *
 * The states are numbered consecutively from 0.
 *
 * NOTE(review): the names appear to spell out the octet range(s) still
 * expected before the current character is complete, e.g. FSM_80BF would
 * be "one more octet in 0x80-0xBF" (UTF8-tail in the ABNF at the top of
 * this file) and FSM_80BF80BF "two more such octets" -- confirm against
 * the code that consumes these values.
 */
#define FSM_START         0
#define FSM_80BF          1
#define FSM_A0BF          2
#define FSM_80BF80BF      3



( run in 0.615 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )