Lingua-NATools

 view release on metacpan or  search on metacpan

src/pre.c  view on Meta::CPAN

/* -*- Mode: C; c-file-style: "stroustrup" -*- */

/* NATools - Package with parallel corpora tools
 * Copyright (C) 1998-2001  Djoerd Hiemstra
 * Copyright (C) 2002-2014  Alberto Simões
 *
 * This package is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */


#include <stdio.h>
#include <wctype.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <wchar.h>
#include <NATools.h>

#include "standard.h"
#include "invindex.h"
#include "unicode.h"
#include "partials.h"
#include "ngramidx.h"


/**
 * @file
 * @brief Corpora pre-processing unit
 */

/**
 * @brief maximum number of words in a translation unit
 */
#define MAXBUF 500

/**
 * @brief number of iterations between updating progress information
 */
#define STEP 100

/**
 * @brief value used as default size when allocating the buffer for
 * the index
 */
#define DEFAULT_INDEX_SIZE 150000

/**
 * @brief file-scope flag, private to this translation unit; it has
 * static storage duration and therefore starts out zero-initialized.
 *
 * NOTE(review): presumably set when the -q (quiet mode) option is
 * given -- the code that writes and reads it is not visible here; confirm.
 */
static nat_boolean_t quiet;

/**
 * @brief Optionally converts a wide-character string to lowercase, in place
 *
 * @param sen          NUL-terminated wide string; overwritten character by
 *                     character with towlower() when ignore_case is true
 * @param ignore_case  when false, the string is left untouched
 * @return the very same pointer received in sen
 */
static wchar_t *my_lowercase(wchar_t *sen, nat_boolean_t ignore_case) {
    wchar_t *cursor;

    /* Case is significant: hand the string back unmodified. */
    if (!ignore_case) {
        return sen;
    }

    for (cursor = sen; *cursor != L'\0'; cursor++) {
        *cursor = towlower(*cursor);
    }

    return sen;
}

/**
 * @brief Writes the nat-pre usage line and the list of supported
 * command-line options to standard output
 */
void show_help(void) {
    fputs("Usage: nat-pre [-iq] cp1 cp2 lex1 lex2 crp1 crp2\n", stdout);

    /* printf is kept here because the PACKAGE macro (defined outside this
     * block) is concatenated into the format string itself. */
    printf(
        "Supported options:\n"
        "  -h shows this help message and exits\n"
        "  -V shows "PACKAGE" version and exits\n"
        "  -v activates verbose mode (incompatible with quiet mode)\n"
        "  -i activates ignore case\n"
        "  -q activates quiet mode\n"
        "Check nat-pre manpage for details.\n"
    );
}

static int AddSentence(wchar_t **sen, unsigned long len,
		       Words* wl, Corpus *Corpus,
		       InvIndex *Index, nat_uint32_t sentence_number,
		       PartialCounts *partials, nat_boolean_t ignore_case)
{
    nat_uint32_t i, wid = 0;
    
    /* Add each sentence word */
    for (i = 0; i < len; i++) {
		int flag; 		/* 1: lowercase; 2: Capital; 3: UPPERCASE */

		if (isCapital(*sen)) flag = 2;
		else if (isUPPERCASE(*sen)) flag = 3;
		else flag = 1;
        
		if (wcslen(*sen) >= MAXWORDLEN) {
		    fprintf(stderr, "**WARNING** Truncating word '%ls'\n", *sen);
	            (*sen)[MAXWORDLEN - 1] = L'\0';
		}

        wid = words_add_word(wl, my_lowercase(*sen, ignore_case));

        if (wid) {
            partials = PartialCountsAdd(partials, wid);
		
            if (corpus_add_word(Corpus, wid, flag)) return 1;
		



( run in 1.355 second using v1.01-cache-2.11-cpan-71847e10f99 )