AI-MicroStructure

 view release on metacpan or  search on metacpan

bin/micro-dict  view on Meta::CPAN

#!/bin/bash
# Split unquoted expansions on newlines only, so arguments and file names
# containing spaces are not broken apart by the unquoted $1 / $* used below.
IFS=$'\n';

# Disabled filter: an alternation of junk tokens that used to be removed from
# the word stream with the egrep stage shown below.
#REMOVETHESE="gov|search|cid|aaa|bbb|ccc|ddd|eee|fff|ggg|hhh|iii|jjj|kkk|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|eee|fff|ggg|hhh|iii|jjj|kkk|lll|mmm|nnn|ooo|ppp|qqq|rrr|sss|ttt|uuu|vvv|www|xxx|yyy|zzz|org|wiki|png|jpg|thumb|pdf|ref|idx|php|html|json|abc|...
#  egrep -v "($REMOVETHESE)" |

                                    #+ options to sort. Changed from




# Build a '|'-separated alternation from the keys of the hash reference that
# AI::MicroStructure::WordBlacklist::getStopWords('de') returns ('de' is
# presumably the German list -- confirm against the module).
# NOTE(review): uniquemmasher reassigns $stop before reading it and masher
# never reads it, so this top-level perl call looks redundant.
stop=$(perl -MAI::MicroStructure::WordBlacklist -E  "my \$s=AI::MicroStructure::WordBlacklist::getStopWords('de'); my @s = keys %\$s; print join('|',@s);")



# uniquemmasher INPUT
#
# Tokenises INPUT into lowercase a-z words, drops one- and two-letter words,
# empty lines and a hard-coded word list, then prints the numerically sorted
# contents of /tmp/micro-dict.tmp minus any line whose tail matches the
# stop-word alternation in $stop.
#
#   $1  path of a regular file to read, or otherwise the literal text to process
#
# Globals written: cmd, stop, IFS.
# NOTE(review): the long egrep line is truncated in this view; presumably its
# missing tail is what writes /tmp/micro-dict.tmp -- confirm in the full file.
# NOTE(review): $1 is expanded unquoted, so it is subject to glob expansion.
function uniquemmasher(){

if [ -f "$1" ]
then                                # $1 names a regular file: stream its contents.
cmd=cat
else
cmd=echo
fi



stop=$(perl -MAI::MicroStructure::WordBlacklist -E  "my \$s=AI::MicroStructure::WordBlacklist::getStopWords('de'); my @s = keys %\$s; print join('|',@s);")
IFS=$'\n';

$cmd $1 |   tr A-Z a-z |                # Convert to lowercase.
        tr ' ' '_' |             # Spaces become underscores (the next tr turns them into newlines).
       #tr -cd '\012[a-z][0-9]' |   #  Get rid of everything
                                    #+ non-alphanumeric (in orig. script).
        tr -c '\012a-z'  '\012' |   # Every byte that is not a-z or newline (including '_') becomes a newline.
        egrep -v '^#' |              # Cannot match here: '#' was already replaced by the tr above.
        egrep -v "^[ ]*([A-Za-z][A-Za-z]|[A-Za-z])$" | egrep -v "^$" | egrep -v -i "^ (denkbarer|ganze|bez|ver�ffentlichtes|uns�gliches|ungew�hnliche|vollstaendig|erstem|Inf.|titel|unsaeglichem|beforehand|denkbares|yours|contains|gedurft|seithe...


 stop=$(perl -MAI::MicroStructure::WordBlacklist -E  "my \$s=AI::MicroStructure::WordBlacklist::getStopWords('de'); my @s = keys %\$s; print join('|',@s);")

 cat /tmp/micro-dict.tmp | sort -n | egrep -v "^.*.[\ ].*.[1-9][\:][\ ][\ ]($stop)";


 #if [ !  "$(echo  "$stop" | egrep -i zzzzzzzzzzzz)" ]; then  echo cool; fi




}

#######################################
# masher INPUT
#
# Print every word of INPUT, lowercased, one per line, keeping only words of
# three or more a-z letters.
#
# Arguments:
#   $1 - path of a regular file to read; if it is not a regular file, the
#        argument itself is treated as the text to process.
# Outputs:
#   Words on stdout, one per line, in input order (duplicates kept).
# Returns:
#   Exit status of the final grep: 0 if at least one word was printed,
#   1 if nothing survived the filters.
#######################################
function masher(){
  # Feed the pipeline without ever expanding $1 unquoted: the text must not be
  # glob-expanded or word-split, and a file name may contain spaces or start
  # with '-'. printf is used instead of echo so that text such as "-n" or
  # "-e" is never taken as an option.
  if [ -f "$1" ]; then
    cat -- "$1"
  else
    printf '%s\n' "$1"
  fi |
    tr A-Z a-z |                  # Convert to lowercase.
    tr ' ' '\012' |               # One word per line.
    tr -c '\012a-z' '\012' |      # Any byte that is not a-z or newline also
                                  # ends a word (digits, punctuation, '#', ...).
    # After the tr stages a line holds only a-z, so a single filter removes
    # empty lines and one- or two-letter words. grep -E replaces the
    # obsolescent egrep, which newer GNU grep warns about on stderr.
    grep -E -v '^[a-z]{0,2}$'
}


# Entry point: micro-dict INPUT [1]
# A second argument of exactly "1" selects uniquemmasher; anything else
# (including no second argument) selects masher.
# "$@" forwards every argument unchanged. The previous unquoted $* re-split
# arguments on newlines (the script's IFS) and glob-expanded them, so a
# multi-line text argument lost everything after its first line.
if [ "$2" == 1 ]; then
  uniquemmasher "$@"
else
  masher "$@"
fi
# The script always reports success, whatever the selected function returned.
exit 0

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 0.507 second using v1.00-cache-2.02-grep-82fe00e-cpan-2c419f77a38b )