Lingua-NATools
view release on metacpan or search on metacpan
/* -*- Mode: C; c-file-style: "stroustrup" -*- */
/* NATools - Package with parallel corpora tools
* Copyright (C) 1998-2001 Djoerd Hiemstra
* Copyright (C) 2002-2014 Alberto Simões
*
* This package is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <stdio.h>
#include <wctype.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <wchar.h>
#include <NATools.h>
#include "standard.h"
#include "invindex.h"
#include "unicode.h"
#include "partials.h"
#include "ngramidx.h"
/**
* @file
* @brief Corpora pre-processing unit
*/
/**
* @brief maximum number of words in a translation unit
*/
#define MAXBUF 500
/**
* @brief number of iterations between updating progress information
*/
#define STEP 100
/**
* @brief value used as default size when allocating the buffer for
* the index
*/
#define DEFAULT_INDEX_SIZE 150000
/**
* @brief file-local quiet-mode flag
*
* NOTE(review): presumably set when the -q option listed by show_help()
* is given; the code that writes and reads it is outside this view.
*/
static nat_boolean_t quiet;
/**
 * @brief Lowercases a wide-character string in place when requested
 *
 * @param sen          NUL-terminated wide string; overwritten in place
 * @param ignore_case  when false the string is returned untouched
 * @return the same pointer that was received in @p sen
 */
static wchar_t *my_lowercase(wchar_t *sen, nat_boolean_t ignore_case) {
    wchar_t *cursor;

    /* Case is significant: nothing to fold. */
    if (!ignore_case)
        return sen;

    /* Fold every character up to (not including) the terminator. */
    for (cursor = sen; *cursor != L'\0'; cursor++)
        *cursor = towlower(*cursor);

    return sen;
}
/**
 * @brief Prints the nat-pre usage line and the option summary to stdout
 */
void show_help(void) {
    /* The usage line has no conversion specifiers, so it is written as-is. */
    static const char usage_line[] =
        "Usage: nat-pre [-iq] cp1 cp2 lex1 lex2 crp1 crp2\n";

    fputs(usage_line, stdout);

    /* PACKAGE is spliced into the format string at compile time. */
    printf("Supported options:\n"
           " -h shows this help message and exits\n"
           " -V shows "PACKAGE" version and exits\n"
           " -v activates verbose mode (incompatible with quiet mode)\n"
           " -i activates ignore case\n"
           " -q activates quiet mode\n"
           "Check nat-pre manpage for details.\n");
}
static int AddSentence(wchar_t **sen, unsigned long len,
Words* wl, Corpus *Corpus,
InvIndex *Index, nat_uint32_t sentence_number,
PartialCounts *partials, nat_boolean_t ignore_case)
{
nat_uint32_t i, wid = 0;
/* Add each sentence word */
for (i = 0; i < len; i++) {
int flag; /* 1: lowercase; 2: Capital; 3: UPPERCASE */
if (isCapital(*sen)) flag = 2;
else if (isUPPERCASE(*sen)) flag = 3;
else flag = 1;
if (wcslen(*sen) >= MAXWORDLEN) {
fprintf(stderr, "**WARNING** Truncating word '%ls'\n", *sen);
(*sen)[MAXWORDLEN - 1] = L'\0';
}
wid = words_add_word(wl, my_lowercase(*sen, ignore_case));
if (wid) {
partials = PartialCountsAdd(partials, wid);
if (corpus_add_word(Corpus, wid, flag)) return 1;
( run in 1.355 second using v1.01-cache-2.11-cpan-71847e10f99 )