Mail-Digest-Tools

 view release on metacpan or  search on metacpan

Tools.pm  view on Meta::CPAN

                        : "${$config_out_ref}{'dir_digest'}/archived_today.txt";
    my $dir_archive_top = ${$config_out_ref}{'dir_archive_top'};
    die "Missing top archive directory: $!" unless (-d $dir_archive_top);
    foreach ('a'..'z') {
        die "Missing archive subdirectory $_: $!" unless (-d "$dir_archive_top/$_");
    }
    die "Missing archive subdirectory 'other': $!" unless (-d "$dir_archive_top/other");

    open ARCH, ">$archfile" or die "Couldn't open $archfile for writing: $!";
    print ARCH 'Archived today (', scalar(localtime), "):\n";
    print ARCH '-' x 41, "\n";

    my ($thr, $archstr);
    my $toarchive = 0;
    foreach $thr (sort keys %{$nonrecentref}) {
        my $initial = lc(substr $thr, 0, 1);
        print "Archiving: $thr\n";
        $archstr .= $thr . "\n";
        if ($initial =~ /[a-zA-Z]/) {
            rename($thr, "$dir_archive_top/$initial/$thr") or die "Couldn't move $thr: $!";
        } else {
            rename($thr, "$dir_archive_top/other/$thr") or die "Couldn't move $thr: $!";
        }
        $toarchive++;
        print "$toarchive files archived\n\n" if ($toarchive % 100 == 0);
    }
    print "$toarchive files archived\n\n";
    $toarchive ? print ARCH $archstr : print ARCH "[None.]\n";
    close ARCH or die "Couldn't close $archfile after writing: $!";
}

# _kill_old_files($config_out_ref, $nonrecentref)
#
# Deletes every file named by a key of %{$nonrecentref} and records the
# deletions in a log file.
#
#   $config_out_ref : hash ref.  'deleted_today' is used as the log path when
#                     defined; otherwise "<dir_digest>/deleted_today.txt".
#   $nonrecentref   : hash ref whose keys are the names handed to unlink();
#                     the values are not used.
#
# Names are passed to unlink() exactly as given, so relative names are
# resolved against the current working directory.
# NOTE(review): presumably the caller has already chdir'ed to the threads
# directory -- confirm against the calling code.
#
# Progress is reported on STDOUT.  Dies if the log cannot be opened or
# closed, or if any unlink fails.
sub _kill_old_files {
    my ($config_out_ref, $nonrecentref) = @_;
    my $killfile = defined ${$config_out_ref}{'deleted_today'}
                 ? ${$config_out_ref}{'deleted_today'}
                 : "${$config_out_ref}{'dir_digest'}/deleted_today.txt"; # v1.95

    # Three-argument open with a lexical handle: the configured path is taken
    # literally (no mode characters or whitespace trimming are applied to it)
    # and the handle cannot collide with another KILL handle in the package.
    open my $kill_fh, '>', $killfile
        or die "Couldn't open $killfile for writing: $!";
    print {$kill_fh} 'Deleted today (', scalar(localtime), "):\n";
    print {$kill_fh} '-' x 40, "\n";

    my $tokill = 0;
    foreach my $thr (sort keys %{$nonrecentref}) {
        print "Unlinking: $thr\n";
        unlink $thr or die "Couldn't unlink $thr: $!";
        # Log each name only once its file is really gone, so that the log
        # is still accurate if a later unlink makes us die.
        print {$kill_fh} "$thr\n";
        $tokill++;
        print "$tokill files deleted\n" if ($tokill % 100 == 0);
    }
    print "$tokill files deleted\n";
    print {$kill_fh} "[None.]\n" unless $tokill;
    close $kill_fh or die "Couldn't close $killfile after writing: $!";
}

# _get_digest_list($config_in_ref, $config_out_ref)
#
# Returns a reference to a list of the entries in the digest directory
# (${$config_out_ref}{'dir_digest'}) whose names match the regular expression
# held in ${$config_in_ref}{'grep_formula'}.  The list is sorted
# case-insensitively.  Entry names are returned as readdir() supplies them,
# i.e. without the directory prefix.
#
# Dies if the directory cannot be opened or closed.
sub _get_digest_list {
    my ($config_in_ref, $config_out_ref) = @_;
    my $dir_digest   = ${$config_out_ref}{'dir_digest'};
    my $grep_formula = ${$config_in_ref}{'grep_formula'};

    # Lexical directory handle: a bareword DIR is package-global and could
    # clobber, or be clobbered by, any other DIR handle in the package.
    opendir(my $dh, $dir_digest) || die "no $dir_digest?: $!";
    my @digests =
        sort { lc($a) cmp lc($b) }
        grep { /$grep_formula/ }
        readdir($dh);
    closedir($dh) || die "Could not close $dir_digest: $!";
    return \@digests;
}

# _prep_source_file($config_in_ref, $config_out_ref, $digests_ref)
#
# Builds and returns a reference to a hash keyed by digest id.  Each value is
# a reference to a one-element array holding the digest's file name.
#
#   $config_in_ref  : hash ref; 'pattern_target' is a regular expression
#                     applied to each file name to capture the digest number
#                     (and, where present, a volume number).
#   $config_out_ref : hash ref; 'id_format' is a string of Perl code which is
#                     eval'ed to build the id from the captures ($1, $2).
#   $digests_ref    : array ref of digest file names.
#
# A file name that does not match 'pattern_target' is skipped with a warning.
# Without that check the eval would silently reuse the captures left over
# from the previous file name and overwrite that file's entry.
#
# Dies if the 'id_format' code fails to evaluate.
#
# NOTE(review): 'id_format' is executed with string eval, so it must only
# ever come from a trusted configuration file.
sub _prep_source_file {
    my ($config_in_ref, $config_out_ref, $digests_ref) = @_;  # v1.94
    # %in_out: hash of all instances in directory of a given digest, 
    # value refers to digest's title and its message topics
    my (%in_out, $id);
    # $_ is kept as the loop variable so that it stays visible, alongside
    # $1 and $2, to the code eval'ed from 'id_format'.
    foreach (@{$digests_ref}) {
        my $matched = ($_ =~ m/${$config_in_ref}{'pattern_target'}/);
        unless ($matched) {
            warn "Skipping '$_': file name does not match pattern_target\n";
            next;
        }
        # No other regex may run between the match above and this eval:
        # the eval'ed code reads that match's captures.
        $id = eval(${$config_out_ref}{'id_format'});  # v1.94
        die "Couldn't evaluate id_format for '$_': $@" if $@;
        $in_out{$id} = [ $_ ];
    }
    return \%in_out;
}

# _identify_target_digest($config_in_ref, $config_out_ref,
#                         $dig_number, $dig_entry, $digests_ref)
#
# Walks the file names in @{$digests_ref} and returns the first one whose
# digest number equals $dig_number.  The number is read from the captures of
# ${$config_in_ref}{'pattern_target'}: the second capture when the pattern
# defines one (volume + issue style), otherwise the first.
#
# Dies when neither capture is defined for a file name.  When no file name
# carries the requested number, a two-line notice is printed to STDERR and
# the program exits with status 0.
#
# $dig_entry is accepted but not used here.
sub _identify_target_digest {
    my ($config_in_ref, $config_out_ref,
            $dig_number, $dig_entry, $digests_ref) = @_;
    my $pattern = $config_in_ref->{'pattern_target'};
    my $found;

    CANDIDATE:
    for my $candidate (@{$digests_ref}) {
        # The outcome of the match itself is not tested; only the capture
        # variables are consulted afterwards.
        $candidate =~ m/$pattern/;
        my ($first, $second) = ($1, $2);
        my $number = defined $second ? $second : $first;
        die "Could'nt process digest filename to identify target digest: $!"
            unless defined $number;
        next CANDIDATE unless $number == $dig_number;
        $found = $candidate;
        last CANDIDATE;
    }

    return $found if defined $found;

    print STDERR "No $config_out_ref->{'title'} digest numbered $dig_number could be found in directory\n";
    print STDERR "  $config_out_ref->{'dir_digest'}\n";
    exit 0;
}

sub _get_log_data {
    my ($config_out_ref, $choice, $in_out_ref) = @_;
    my $dir_digest  = ${$config_out_ref}{'dir_digest'};
    my $dir_threads = ${$config_out_ref}{'dir_threads'};
    my $logfile     = ${$config_out_ref}{'digests_log'};
    my $readfile    = defined ${$config_out_ref}{'digests_read'}  # new in 1.95
                    ? ${$config_out_ref}{'digests_read'}
                    : "$dir_digest/digests_read.txt";

    # hash which pulls in data from an external log file that 
    # records which digests have been previously processed
    my (%hashlog);
    open(LOG, $logfile) || die "cannot open $logfile for reading: $!";
    while (<LOG>) {
        chomp;

Tools.pm  view on Meta::CPAN

questions:

=over 4

=item 1

What internal structure has the mailing list sponsor provided for a given 
digest?

=item 2

How do I want to structure the results of applying Mail::Digest::Tools to a 
particular digest on my system?

=back

Each of these two questions breaks down into sub-parts.  Their answers 
supply you with the information with which you will construct the two 
configuration hashes passed to most Mail::Digest::Tools functions.  
Let us take each in turn.

=head1 C<%config_in>: THE INTERNAL STRUCTURE OF A DIGEST

The best way to learn about the internal structure of a mailing list digest 
(other than to study the application which created the digest in the first 
place) is to accumulate several instances of the digest on your system in a 
directory devoted to that purpose.  Examine the way the digest's filename is 
formed.  Then examine the digest file itself.  You will soon pick up a feel 
for the structure of the digest, which will guide you in configuring 
Mail::Digest::Tools for your system.  That configuration will take the form 
of a Perl hash which, for illustrative purposes, we shall here call 
C<%xxx_config_in> where C<xxx> is a short-hand title for a particular digest.

For heuristic purposes we will examine the characteristics of two mailing 
list digests which the author has been following and archiving for several 
years:  ActiveState's 'Perl-Win32-Users' digest and Yahoo! Groups' Perl 
Beginners group digest.

=head2 Analysis of Digest's File Name

We must study a digest's file name in order to be able to write a pattern 
with which we will be able to distinguish a digest file from any non-digest 
file sitting in the same directory, as well as to be able to extract the 
digest number from that file name.

Once saved as plain-text files, Perl-Win32-Users digest files typically look 
like this in a directory:

    Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
    Perl-Win32-Users Digest, Vol 1 Issue 1772.txt

Similarly, the Perl Beginner digest files look like this:

    [PBML] Digest Number 1491.txt
    [PBML] Digest Number 1492.txt

To correctly identify Perl-Win32-Users digest files from any other files in 
the same directory, we compose a string which would form the core of a Perl 
regular expression, I<i.e.,> everything in a pattern except the outer 
delimiters.  Internally, Mail::Digest::Tools passes the file name through a 
C<grep { /regexp/ }> pattern, so the first key is called C<grep_formula>.

    %pw32u_config_in = (
        grep_formula            => 'Perl-Win32-Users Digest',
        ...
    );

The equivalent pattern for the Perl Beginners digest would be:

    %pbml_config_in = (
        grep_formula            => '\[PBML\]',
        ...
    );

Note that the C<[> and C<]> characters have to be escaped with a C<\> 
backslash because they are normally metacharacters inside Perl regular 
expressions.

We next have to extract the digest number from the digest's file name.  
Certain mailing list programs give individual digests both a 'Volume' number 
as well as an individual digest number.  Perl-Win32-Users typifies this.  In 
the example above we need to capture both the C<1> as volume number and C<1771> 
as digest number.  The next key in our configuration hash is called 
C<pattern_target>:

    %pw32u_config_in = (
        grep_formula            => 'Perl-Win32-Users Digest',
        pattern_target          => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
        ...
    );

Note the two sets of capturing parentheses.

Other digests, such as those at Yahoo! Groups, dispense with a volume number 
and simply increment each digest number:

    %pbml_config_in = (
        grep_formula            => '\[PBML\]',
        pattern_target          => '.*\s(\d+)\.txt$',
        ...
    );

Note that this C<pattern_target> contains only one pair of capturing 
parentheses.

=head2 Analysis of Digest's Internal Structure

A digest's internal structure is discussed in detail above (see 
'A TYPICAL MAILING LIST DIGEST').  Here we need to identify two 
characteristics:  the way the digest introduces its list of today's topics 
and the string it uses to delimit the list of today's topics from the first 
individual message in the digest and all subsequent messages from one another.  
Continuing with our two examples from above, we provide values for keys 
C<topics_intro> and C<source_msg_delimiter>: 

    %pw32u_config_in = (
        grep_formula            => 'Perl-Win32-Users digest',
        pattern_target          => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
        topics_intro            => 'Today\'s Topics:',
        source_msg_delimiter    => "--__--__--\n\n",
        ...
    );

Note the escaped C<'> apostrophe character in the value for key 
C<topics_intro>.

    %pbml_config_in = (
        grep_formula            => '\[PBML\]',
        pattern_target          => '.*\s(\d+)\.txt$',
        topics_intro            => 'Topics in this digest:',
        source_msg_delimiter    => "________________________________________________________________________\n________________________________________________________________________\n\n",
        ...
    );

Note that the values provided for the respective C<source_msg_delimiter> keys 
had to be double-quoted strings.  That's because all such delimiters include 
two or more C<\n> newline characters so that they form paragraphs unto 
themselves.  Unless indicated otherwise, the values for all other keys in 
the configuration hash are single-quoted strings.

Note:  In early 2004, while Mail::Digest::Tools was being prepared for its 
initial distribution on CPAN, ActiveState changed certain features in the 
daily digest versions of its mailing lists.  Hence, the code example presented 
above should not be 'copied-and-pasted' into a configuration hash with which 
you, the user, might follow the current Perl-Win32-Users digest.  In 
particular, the source message delimiter was changed to a string of 30 
hyphens followed by 2 C<\n> newline characters:

    "------------------------------\n\n"

However, since it is not unheard of for contributors to a mailing list to use 
such a string of hyphens within their postings or signatures, using a string 
of hyphens is not a particularly apt choice for a source message delimiter.  
In this particular case, the author is getting better (but not fully tested) 
results by including an additional newline I<before> the hyphen string in 
order to more uniquely identify the source message delimiter:

    "\n------------------------------\n\n"

=head2 Analysis of Individual Messages

The internal structure of an individual message within a digest is also 
discussed in detail above.  Here we need to identify patterns with which we 
can extract the content of the message's headers.

Certain mailing list digest programs allow a wide variety of headers to appear 
in digested messages.  The Perl-Win32-Users digest typifies this.  Each 
message in a Perl-Win32-Users digest I<must> have a message number and headers 
for the message's author, recipients, subject and date.

    Message: 1
    From: Chris Smithson <ChrisSmithson@some.web.address.com>
    To: "'Carter Kraus'" <carter@some.web.address.com>,
           "Perl-Win32-Users (E-mail)" <perl-win32-users@activestate.com>
    Subject: RE: OO Perl Issue.
    Date: Wed, 4 Feb 2004 14:17:24 -0600 

But a message in this digest may have additional headers for the author's 
organization, reply address and/or carbon-copy recipients.

    Message: 5
    Date: Wed, 4 Feb 2004 15:15:44 -0800
    From: Sam Spade <sspade@some.web.address.com>
    Organization: Some Web Address
    Reply-To: Sam Spade <sspade@some.web.address.com>
    To: "Time" <summers@some.web.address.com>
    CC: "Perl List" <perl-win32-users@listserv.activestate.com>
    Subject: Re: New IE Update causes script problems

Patterns are easily developed to capture this information and store it in the 
configuration hash:

    %pw32u_config_in = (
        grep_formula            => 'Perl-Win32-Users digest',
        pattern_target          => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
        topics_intro            => 'Today\'s Topics:',
        source_msg_delimiter    => "--__--__--\n\n",
        message_style_flag      => '^Message:\s+(\d+)$',
        from_style_flag         => '^From:\s+(.+)$',
        org_style_flag          => '^Organization:\s+(.+)$',
        to_style_flag           => '^To:\s+(.+)$',
        cc_style_flag           => '^CC:\s+(.+)$',
        subject_style_flag      => '^Subject:\s+(.+)$',
        date_style_flag         => '^Date:\s+(.+)$',
        reply_to_style_flag     => '^Reply-To:\s+(.+)$',
        ...
    );

Other mailing list digest programs allow far fewer headers in digested 
messages.  The Yahoo! Groups digests such as Perl Beginner typify this.

    Message: 4
       Date: Sun, 7 Dec 2003 19:24:03 +1100
       From: Philip Streets <phil@some.web.address.com.au>
    Subject: RH9.0, perl 5.8.2 and qmail-localfilter question

The patterns developed to capture this information and store it in the 
configuration hash would be as follows:

    %pbml_config_in = (
        grep_formula            => '\[PBML\]',
        pattern_target          => '.*\s(\d+)\.txt$',
        topics_intro            => 'Topics in this digest:',
        source_msg_delimiter    => "________________________________________________________________________\n________________________________________________________________________\n\n",
        message_style_flag      => '^Message:\s+(\d+)$',
        from_style_flag         => '^\s+From:\s+(.+)$',
        subject_style_flag      => '^Subject:\s+(.+)$',
        date_style_flag         => '^\s+Date:\s+(.+)$',
        ...
    );

Note that these patterns are written to expect one or more whitespace 
characters at the beginning of the C<from_style_flag> and the 
C<date_style_flag>.

We could -- but do not need to -- add the following key-value pairs to the 
C<%pbml_config_in> hash.

        org_style_flag          => undef,
        to_style_flag           => undef,
        cc_style_flag           => undef,
        reply_to_style_flag     => undef,

=head2 Inspection of Messages for Multipart MIME Content

Certain mailing lists allow subscribers to post messages in either plain-text 
or HTML.  Certain lists allow subscribers to post attachments; others do not.  
When it comes to preparing digests of these messages, the approaches which 
different lists take lead to different results.  The most annoying situation 
occurs when a list allows a subscriber to post in 'multipart MIME format' and 
then fails to strip out the redundant HTML part after printing the needed 
plain-text part.

I<Example:>  An all too typical example from an older version of an ActiveState 
list digest.  (ActiveState changed the format of its digests in early 2004 to 
strip out HTML attachments.  Hence, the following code no longer accurately 
represents what a subscriber to an ActiveState digest will see.  Other mailing 
lists still suffer from MIME bloat, however, so treat the following code as 
illustrative.)  The message begins:

    Message: 1
    To: Perl-Win32-Users@activestate.com
    Subject: Can not tie STDOUT to scolled Tk widget
    From: John_Wonderman@some.web.address.ca
    Date: Thu, 15 Jan 2004 16:25:17 -0500
    This is a multipart message in MIME format.
    --=_alternative 00750F0485256E1C_=
    Content-Type: text/plain; charset="US-ASCII"
    Hi;
    I am trying to implement a scrolling text widget to capture output for for 
    at tk app. Without scrolling:
    my $text = $mw->Text(-width => 78,
           -height => 32,
           -wrap => 'word',
           -font => ['Courier New','11']
    )->pack(-side => 'bottom',
           -expand => 1,
           -fill => 'both',
    );
    ...

When the plain-text part of the message is finished, it is then repeated in 
HTML:

    --=_alternative 00750F0485256E1C_=
    Content-Type: text/html; charset="US-ASCII"
    <br><font size=2 face="Tahoma">Hi;</font>
    <p><font size=2 face="Tahoma">I am trying to implement a scrolling text
    widget to capture output for for at tk app. Without scrolling:</font>
    <p><font size=2 face="Bitstream Vera Sans Mono">my $text = $mw-&gt;Text(-width
    =&gt; 78,</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">&nbsp; &nbsp; &nbsp; &nbsp;
    -height =&gt; 32,</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">&nbsp; &nbsp; &nbsp; &nbsp;
    -wrap =&gt; 'word',</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">&nbsp; &nbsp; &nbsp; &nbsp;
    -font =&gt; ['Courier New','11']</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">)-&gt;pack(-side =&gt;
    'bottom',</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">&nbsp; &nbsp; &nbsp; &nbsp;
    -expand =&gt; 1,</font>
    <br><font size=2 face="Bitstream Vera Sans Mono">&nbsp; &nbsp; &nbsp; &nbsp;
    -fill =&gt; 'both',</font>

There is no reason to retain this bloat in your thread file.  The digest 
providers should have stripped it out, but the program they were using failed 
to do so.  Other digests, such as those at Yahoo! Groups, eliminate all this 
blather.

Now, with Mail::Digest::Tools, you can eliminate much of the bloat yourself.  
After examining 6-10 instances of a particular mailing list digest, you should 
be able to determine whether the digest needs a dose of digital castor oil or 
not, and you set key C<MIME_cleanup_flag> accordingly.  If the digest contains 
unnecessary multipart MIME content, you set this flag to C<1>; otherwise, to 
C<0>.

And with that you have completed your analysis of the internal structure of a 
given digest and entered the relevant information into the first configuration 
hash:

    %pw32u_config_in = (
        grep_formula            => 'Perl-Win32-Users digest',
        pattern_target          => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
        topics_intro            => 'Today\'s Topics:',
        source_msg_delimiter    => "--__--__--\n\n",
        message_style_flag      => '^Message:\s+(\d+)$',
        from_style_flag         => '^From:\s+(.+)$',
        org_style_flag          => '^Organization:\s+(.+)$',
        to_style_flag           => '^To:\s+(.+)$',
        cc_style_flag           => '^CC:\s+(.+)$',
        subject_style_flag      => '^Subject:\s+(.+)$',
        date_style_flag         => '^Date:\s+(.+)$',
        reply_to_style_flag     => '^Reply-To:\s+(.+)$',
        MIME_cleanup_flag       => 1,
    );

    %pbml_config_in = (
        grep_formula            => '\[PBML\]',
        pattern_target          => '.*\s(\d+)\.txt$',
        topics_intro            => 'Topics in this digest:',
        source_msg_delimiter    => "________________________________________________________________________\n________________________________________________________________________\n\n",
        message_style_flag      => '^Message:\s+(\d+)$',
        from_style_flag         => '^\s+From:\s+(.+)$',
        subject_style_flag      => '^Subject:\s+(.+)$',
        date_style_flag         => '^\s+Date:\s+(.+)$',
        MIME_cleanup_flag       => 0,
    );

=head1 C<%config_out>: HOW TO PROCESS A DIGEST ON YOUR SYSTEM

C<%config_in> holds the answers to the question:  What internal structure has 
the mailing list sponsor provided for a given digest?  In contrast, 
C<%config_out> will hold the answer to this question:  How do I want to 
structure the results of applying Mail::Digest::Tools to a particular digest 
on my system?

For purpose of illustration, we will continue to assume that we are processing 
digest files received from the Perl-Win32-Users and Perl Beginner lists.  We 
will make slightly different choices as to how we process those digest files 
so as to illustrate different options available from Mail::Digest::Tools.

We shall also assume that we are going to place the scripts from which we call 
Mail::Digest::Tools functions in the directory I<above> the directories in 
which we store the digest files once they have been saved as plain-text files.  
If we call this directory C<digest> and place the scripts in that directory, 
then we will have a directory structure that starts out like this:

    digest/
        process_new.pl
        process_ALL.pl
        reply_digest_message.pl
        repair_digest_order.pl
        consolidate_threads.pl
        deletables.pl
        pw32u/
            Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
            Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
        pbml/
            [PBML] Digest Number 1491.txt
            [PBML] Digest Number 1492.txt

=head2 Required C<%config_out> Keys

There are 9 keys which are required in C<%config_out> in order for 
Mail::Digest::Tools to function properly.  They correspond to 9 decisions 
which you must make in setting up a Mail::Digest::Tools configuration on 
your system.

=over 4

=item 1 Title

Each digest must be given a title which is used whenever Mail::Digest::Tools 
needs to prompt or warn you on standard output.  The key which holds this 
information in C<%config_out> must be called C<title>; the value for this 
element should be sensible.

    %pw32u_config_out = (

Tools.pm  view on Meta::CPAN

=back        

=head1 HELPFUL HINTS

... in which the module author shares what he has learned using 
Mail::Digest::Tools and its predecessors since August 2000.

=head2 Initial Configuration and Testing

As mentioned above, if you are considering creating a local archive of threads 
originating in daily digest versions of a mailing list, you should first 
accumulate 6-10 instances of such digests and both:

=over 4

=item 1

study the internal structure of the digest -- needed to develop a 
C<%config_in> for the digest; and

=item 2

carefully consider how you wish to structure the output from the module's 
use on your system -- needed to develop C<%config_out> for the digest

=back

Once you have developed the initial configuration, you should call 
C<reprocess_ALL_digests()> on the digests, then open the files created to see 
if the results are what you want.  If they are I<not> what you want, then you 
need to think about what you should change in C<%config_in> and/or 
C<%config_out>.  Make those changes, then call C<reprocess_ALL_digests()> 
again.  Repeat as needed, making sure not to delete any of the digest files 
you are using as sources until you are completely satisfied with your 
configuration.

Once, however, you I<are> satisfied with your configuration, you should call 
C<process_new_digests()> on new instances of digests and I<never> call 
C<reprocess_ALL_digests()> for that digest again (lest you not be able to 
regenerate threads containing messages from digests you have deleted over 
time).

=head2 Where to Store the Configuration Hashes

As mentioned above, you will probably find it convenient to write separate 
Perl scripts to call each one of Mail::Digest::Tools' public functions.  You 
could code C<%config_in> and C<%config_out> in each of those scripts just 
before the respective function calls.  But that would violate the principle of 
'Repeated Code Is a Mistake' and multiply maintenance problems.  It's far 
better to code the two configuration hashes in a separate plain-text file and 
'require' that file into your script.  That way, any changes you make in the 
configuration will be automatically picked up by each script that calls a 
Mail::Digest::Tools function.

Here is an example of such a file holding the configuration hashes governing 
use of the Perl-Win32-Users digest, along with a script making use of that file.

    # file:  pw32u.digest.data
    $topdir = "E:/Digest/pw32u";
    %config_in =  (
         grep_formula           => 'Perl-Win32-Users digest',
         pattern_target          => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
         # next element's value must be double-quoted
         source_msg_delimiter   => "--__--__--\n\n",
         topics_intro           => 'Today\'s Topics:',
         message_style_flag     => '^Message:\s+(\d+)$',
         from_style_flag        => '^From:\s+(.+)$',
         org_style_flag         => '^Organization:\s+(.+)$',
         to_style_flag          => '^To:\s+(.+)$',
         cc_style_flag          => '^CC:\s+(.+)$',
         subject_style_flag     => '^Subject:\s+(.+)$',
         date_style_flag        => '^Date:\s+(.+)$',
         reply_to_style_flag    => '^Reply-To:\s+(.+)$',
         MIME_cleanup_flag      => 1,
    );

    %config_out =  (
         title                  => 'Perl-Win32-Users',
         dir_digest             => $topdir,
         dir_threads            => "$topdir/Threads",
         dir_archive_top        => "$topdir/Threads/archive",
         archived_today         => "$topdir/archived_today.txt",
         de_archived_today      => "$topdir/de_archived_today.txt",
         deleted_today          => "$topdir/deleted_today.txt",
         digests_log            => "$topdir/digests_log.txt",
         digests_read           => "$topdir/digests_read.txt",
         todays_topics          => "$topdir/todays_topics.txt",
         mimelog                => "$topdir/mimelog.txt",
         id_format              => 'sprintf("%03d",$1) . \'_\' . 
                                        sprintf("%04d",$2)',
         output_id_format       => 'sprintf("%04d",$1)',
         MIME_cleanup_log_flag  => 1,
         # next element's value must be double-quoted
         thread_msg_delimiter   => "--__--__--\n\n",
         archive_kill_trigger   => 1,
         archive_kill_days      => 14,
         digests_read_flag      => 1,
         archive_config         => 0,
    );

    # script:  dig.pl
    # USAGE:  perl dig.pl
    #!/usr/bin/perl
    use strict;
    use warnings;
    use Mail::Digest::Tools qw( process_new_digests );

    our (%config_in, %config_out);
    my $data_file = 'pw32u.digest.data';
    require $data_file;

    process_new_digests(\%config_in, \%config_out);

    print "\nFinished\n";

=head2 Maintaining Local Archives of More than One Digest

The module author has maintained local archives of more than a half dozen 
different mailing list digests over the past several years.  He has found it 
convenient to maintain the configuration information for I<all> the digests 
he is following at a given time in a I<single> configuration file.  The 
advantage to this approach is that if two digests share a similar internal 
structure (perhaps due to being generated by the same mailing list program or 
list provider) and if the user chooses to structure the output from the two 
digests in similar or identical ways, then getting the configuration hashes 
right becomes much easier and the potential for error is reduced.

Here is a sample directory and file structure for maintaining archives of 
two different digests on a Win32 system:

    digest/
    digest.data
    process_new.pl
    process_ALL.pl
    reply_digest_message.pl
    repair_digest_order.pl
    consolidate_threads.pl
    deletables.pl
    pw32u/
        Perl-Win32-Users Digest, Vol 1 Issue 1771.txt
        Perl-Win32-Users Digest, Vol 1 Issue 1772.txt
        digests_log.txt
        digests_read.txt
        mimelog.txt
        Threads/
    pbml/
        [PBML] Digest Number 1491.txt
        [PBML] Digest Number 1492.txt
        digests_log.txt
        Threads/

File F<digest.data> would look like this:

    # digest.data
    $topdir = "E:/Digest";
    %digest_structure = (
        pbml =>    {
             grep_formula   => '\[PBML\]',
             pattern_target => '.*\s(\d+)\.txt$',
             ...
           },
        pw32u =>   {
             grep_formula   => 'Perl-Win32-Users digest',
             pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt',
             ...
           },
    );
    %digest_output_format = (
        pbml =>    {
             title          => 'Perl Beginner',
             dir_digest     => "$topdir/pbml",
             dir_threads    => "$topdir/pbml/Threads",
             ...
           },
        pw32u =>   {
             title          => 'Perl-Win32-Users',
             dir_digest     => "$topdir/pw32u",
             dir_threads    => "$topdir/pw32u/Threads",
             ...
           },
    );

To accommodate this slightly more complex structure in the configuration file, 
the calling script might be modified as follows:

    # script:  dig.pl
    # USAGE:  perl dig.pl [short-name for digest]
    #!/usr/bin/perl
    use Mail::Digest::Tools qw( process_new_digests );

    my ($this_key, %config_in, %config_out);
    # variables imported from $data_file
    our (%digest_structure, %digest_output_format);    

    my $data_file = 'digest.data';
    require $data_file;

    $this_key = shift @ARGV;
    die "\n     The command-line argument you typed:  $this_key\n     does not call an accessible digest$!" 
        unless (defined $digest_structure{$this_key}
            and defined $digest_output_format{$this_key});

    my ($k,$v);
    while ( ($k, $v) = each %{$digest_structure{$this_key}} ) {
        $config_in{$k} = $v;
    }
    while ( ($k, $v) = each %{$digest_output_format{$this_key}} ) {
        $config_out{$k} = $v;
    }

    process_new_digests(\%config_in, \%config_out);

    print "\nFinished\n";

=head2 Getting Your Mail to the Right Place on Your System

For several years the module author used the scripts which were predecessors 
to Mail::Digest::Tools on a Win32 system where mail was read with Microsoft 
Outlook Express.  He would do a "File/Save as.." on an instance of a digest, 
select text format (*.txt) and save it to an appropriate directory.  Later, 
the author used the shareware e-mail client Poco, in which the same operation 
was accomplished by highlighting a file and keying "Ctrl+S".



( run in 0.515 second using v1.01-cache-2.11-cpan-39bf76dae61 )