#!/usr/bin/perl
# -*- Perl -*-          # Fri 30 Jun 11:35:28 CDT 2006
###############################################################################
# Written by Tim Skirvin <tskirvin@killfile.org>.  Copyright 2004-2006,
# Tim Skirvin.  Redistribution terms are below.
###############################################################################
our $VERSION = "0.10";

###############################################################################
### User Configuration ########################################################
###############################################################################
## Everything in this section is a package global so that the local
## configuration file (see $LOCALCONF) can override it.
use vars qw( $LOCALCONF $KIBOZEDIR $DB_TYPE $ARCHIVEGROUP $DEBUG $QUIET
             $VERBOSE $URL $PREFIX $DOMAIN $FROM );

## Things used to actually get the sentnews file and parse it.
##
##   $URL     Location of the sentnews file; it is downloaded with
##            LWP::Simple's get() every time the script runs.
##   $PREFIX  Passed, together with $DOMAIN, to add_message_id() on every
##   $DOMAIN  article.  NOTE(review): presumably these are only used to
##            build a Message-ID for articles that lack one - confirm
##            against News::Article.
##   $FROM    Replacement From: header, used when an article's From: header
##            is exactly 'tskirvin'.

# $URL    = "http://www.stanford.edu/~tskirvin/sentnews";
$URL    = "http://www.killfile.org/~tskirvin/sentnews";
$PREFIX = "invalid.notrealid";
$DOMAIN = "news.killfile.org";
$FROM   = 'tskirvin@unknown.site.invalid (Tim Skirvin)';

## Rather than having everything in this shared configuration, load this
## file to get additional configuration.  This file contains additional
## perl; it is only loaded if it is readable, and '-c' on the command line
## replaces this path.

$LOCALCONF = "$ENV{'HOME'}/.kibozerc";

## Where should we store all of our files?  This needs to be set or
## nothing will run; also, the directory must already exist.  When set, it
## is handed to News::Archive as its 'basedir' option.

$KIBOZEDIR = "";

## What kind of database should we use to store history information?
## Handed to News::Archive as its 'db_type' option.

$DB_TYPE  = "DB_File";

## If we're storing everything to an archive group, you can set it here.
## When set, the archive is subscribed to this group and every saved
## article is filed under it in addition to the groups from its own
## Newsgroups: header.

$ARCHIVEGROUP = "";

## Do we want to print debugging information?  Use verbose status messages?
## Be entirely silent?  Set these things here.
##
##   $DEBUG    Handed to News::Archive as its 'debug' option; also turns on
##             $VERBOSE.
##   $VERBOSE  Report on every article as it is accepted or skipped.
##   $QUIET    Suppress the message that Exit() would otherwise print.

$DEBUG    = 0;
$VERBOSE  = 0;
$QUIET    = 0;

## If the modules are set up in a non-standard place, edit this line
## as appropriate.
BEGIN { use lib '/home/tskirvin/dev/news-archive'; }

###############################################################################
### main() ####################################################################
###############################################################################

use strict;
use News::Archive;
use News::Article;
use News::Article::Mbox;
use LWP::Simple;
use Date::Parse;
use Getopt::Std;

# Exit statuses, keyed by the symbolic names that Exit() accepts.
our %ERROR = ( 'SUCCESS' => 0, 'CONFIG' => 1, 'SERVER' => 2 );

# Command-line configuration
our %OPTS;
getopts('hvVQdc:a:', \%OPTS);

# Load local configuration from local configuration file.  The path is made
# absolute first: 'do FILE' looks a bare relative path up in @INC rather than
# in the current directory, so "-c kibozerc" would otherwise pass the -r test
# and then silently fail to load.
$LOCALCONF = $OPTS{'c'} if $OPTS{'c'};
if ( $LOCALCONF && -r $LOCALCONF ) {
  require File::Spec;
  my $conffile = File::Spec->rel2abs($LOCALCONF);
  my $loaded   = do $conffile;
  # A syntax error in the configuration should not go unnoticed.
  warn "Couldn't parse $conffile: $@" if !defined $loaded && $@;
}

# -h: print usage information (taken from this script's own POD) and exit.
if ( $OPTS{'h'} ) {
  require Pod::Usage;
  Pod::Usage::pod2usage( -exitval  => $ERROR{'SUCCESS'},
                         -verbose  => 99,
                         -sections => 'NAME|SYNOPSIS|DESCRIPTION' );
}

# -v: print version information and exit.
if ( $OPTS{'v'} ) {
  my $banner = version();
  print "$banner\n";
  exit $ERROR{'SUCCESS'};
}

# Other command-line parsing; overrides localconf
$DEBUG   = $OPTS{'d'} if defined $OPTS{'d'};
$VERBOSE = $OPTS{'V'} || $DEBUG || $VERBOSE || 0;       # Give verbose messages?
$QUIET   = $OPTS{'Q'}           || $QUIET   || 0;
$ARCHIVEGROUP = $OPTS{'a'} if defined $OPTS{'a'};   # getopts declares 'a:'

# Create the options hash to start a News::Archive object; only settings
# that are actually set are passed along.
our %OPTHASH;
$OPTHASH{ 'basedir' } = $KIBOZEDIR  if $KIBOZEDIR;
$OPTHASH{ 'db_type' } = $DB_TYPE    if $DB_TYPE;
$OPTHASH{ 'debug'   } = $DEBUG      if $DEBUG;

## Download the most recent version of the sentnews file from $URL, and get
## the articles from it.

# LWP::Simple's get() only signals failure by returning undef, so there is no
# system error worth reporting here.
my $content = get($URL);
die "Couldn't get $URL\n" unless defined $content;
my @content = split(/\n/, $content);

# Hold back every "From: " line and re-emit the held lines, in reverse order,
# just before the next blank line; all other lines pass straight through.
# NOTE(review): the loop does not distinguish headers from body text, so a
# body line beginning with "From: " is moved as well.
my (@lines, @from);
foreach my $line ( @content ) {
  if ( $line =~ /^From: / ) {
    push @from, $line;
    next;
  }
  if ( $line =~ /^\s*$/ && @from ) {
    push @lines, reverse @from;
    @from = ();
  }
  push @lines, $line;
}

# If the input ends without a blank line, whatever is still held back would
# otherwise be lost.
push @lines, reverse @from;

$_ .= "\n" foreach @lines;      # Put the newlines back in
my @articles = News::Article::Mbox->read_mbox(\@lines);

## Clean each article up before it goes anywhere near the archive: give it a
## message ID, expand the short-hand sender, mask a name in the body, and make
## sure there is a Date: header.
for my $post (@articles) {
  # Fallback date: the article's TIMESTAMP entry with its leading
  # "From <sender>" stripped off.  NOTE(review): presumably this is the mbox
  # separator line recorded by News::Article::Mbox - confirm.
  (my $fallback_date = $post->{TIMESTAMP} || "") =~ s/^From \S+\s*//;

  $post->add_message_id($PREFIX, $DOMAIN);
  $post->set_headers('from', $FROM) if $post->header('from') eq 'tskirvin';

  my @text = $post->body;
  s/([Tt])umati/$1\*mati/g for @text;

  $post->set_headers('date', $fallback_date) if !$post->header('date');
  $post->set_body(@text);
}

# Create the News::Archive object after we have the articles, so that a
# failed download never touches the archive.  Either failure below
# ends the run with the 'SERVER' exit status.
my $archive = News::Archive->new( %OPTHASH )
        or Exit('SERVER', "Couldn't create/load archive item: ",
                                                News::Archive->error);
$archive->lock or Exit ('SERVER', "Couldn't lock archive: $!");

# If we've got $ARCHIVEGROUP set, then make sure we're subscribed to it.
$archive->subscribe($ARCHIVEGROUP) if ($ARCHIVEGROUP);

# Check for each article, and save the ones we have not seen before.
my $count = 0;
ARTICLE: foreach my $article (@articles) {
  next ARTICLE unless $article;

  # Without a message ID we can't look the article up in the archive
  my $messageid = $article->header('message-id');
  next ARTICLE unless $messageid;

  # If this article has already been processed, skip it
  if ( $archive->article( $messageid ) ) {
    warn "Already processed '$messageid'\n" if $VERBOSE;
    next ARTICLE;
  }

  # We need at least this many more headers to do anything with it
  foreach my $required (qw( from newsgroups subject date )) {
    next ARTICLE unless $article->header($required);
  }

  # Debugging hook; don't worry about this in general.
  # $article->write(\*STDOUT);

  # Get the list of groups we're supposed to be saving the article into.
  # Whitespace inside a name is removed, and empty names (from a stray or
  # doubled comma) are dropped rather than passed on as groups.
  my @groups;
  foreach my $name ( split /,/, scalar $article->header('newsgroups') ) {
    $name =~ s/\s+//g;
    push @groups, $name if length $name;
  }

  # Make sure we're subscribed to all these groups
  $archive->subscribe($_) foreach @groups;

  # We're subscribed to the archive group already; don't list it twice if the
  # article was posted there in the first place.
  if ( $ARCHIVEGROUP && !grep { $_ eq $ARCHIVEGROUP } @groups ) {
    push @groups, $ARCHIVEGROUP;
  }

  # Nothing usable in Newsgroups: and no archive group - nowhere to file it
  next ARTICLE unless @groups;

  # Actually save the article: raw headers, a blank separator line, then body.
  my $ret = $archive->save_article(
        [ @{$article->rawheaders}, '', @{$article->body} ], @groups );
  $count++ if $ret;
  if ($VERBOSE) { warn $ret ? "Accepted article $messageid\n"
                            : "Couldn't save article $messageid\n" }

}

$archive->close;

Exit('SUCCESS', $count ? "$count articles archived" : "");

###############################################################################
### Subroutines ###############################################################
###############################################################################

## Exit ( CODE, REASON [, DETAILS...] )
# Ends the program with the exit status that %ERROR holds for CODE.  If
# REASON is true, it and any DETAILS are first printed with warn(), one per
# line, unless $QUIET is set.
sub Exit {
  my ($code, $reason, @details) = @_;
  my $status = $ERROR{$code};

  if ($reason) {
    # map { $_ = " - $_" } @details;
    my $message = join("\n", $reason, @details, '');
    warn $message unless $QUIET;
  }

  exit $status;
}

## version ( [VERSION] )
# Returns the program name and version number as "PROGRAM vVERSION".  The
# version shown is VERSION if given (and true), otherwise $VERSION, otherwise
# the string "unknown".
sub version {
  my $version = shift || $VERSION || "unknown";
  my $prog = $0; $prog =~  s%.*/%%g;            # Clean up the program name
  return "$prog v$version";
}

###############################################################################
### Documentation #############################################################
###############################################################################

=head1 NAME

getposts - get posts for News::Archive from an mbox file

=head1 SYNOPSIS

  mbox2news.pl [-hvVQd] [-c configfile] [-a groupname] < ARTICLE

=head1 DESCRIPTION

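(OLD - this description was inherited from mbox2news.pl and has not been
updated.  This script does not read articles from STDIN; it downloads the
sentnews file from $URL and archives the articles found in it.)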

mbox2news.pl is a part of the News::Archive package, which is used to
archive news articles in a reasonably efficient manner.  This particular
script is meant for use for personal archives, loading articles (or
mbox-style archives) on STDIN and loading them into $KIBOZEDIR.

Without any options, mbox2news.pl will look for input from STDIN.
Configuration information is stored in the configuration file
(B<kibozerc>).  Command-line options are as follows:

=over 4

=item -h

Print this usage information and exit.

=item -v

Print version information and exit.

=item -d

Debug; print debugging information (implies -V)

=item -c configfile

Load this configuration file instead of default (~/.kibozerc).


=item -a groupname

Add this group to all articles we download, so that they can be accessed
uniformly.

=item -V

Verbose; print information on every article.

=item -Q

Quiet mode; only print that which is absolutely necessary (and even then
think about it).

=back

=head1 NOTES

This is essentially a counterpart to kiboze.pl; while that program goes to
news servers and downloads new messages, mbox2news.pl uses existing
messages.  It is primarily meant to convert from old kiboze archives, or
anything similar.

This script was meant to work with mbox-style archives; this is compatible
with some single articles, however, so you can feed them in directly as well.

You may well need a helper script to make some programs give you
mbox-compatible archives.  This distribution comes with 'nnparse', which
is used to fix the 'sentnews' items from the nn news reader; you can use
it as an example if you wish.

=head1 REQUIREMENTS

B<News::Archive>, B<News::Article::Mbox>, [...]

=head1 SEE ALSO

B<kibozerc>, B<newsarchive.pl>, B<kiboze.pl>

=head1 AUTHOR

Tim Skirvin <tskirvin@killfile.org>

=head1 LICENSE

This code may be redistributed under the same terms as Perl itself.

=head1 HOMEPAGE

http://www.killfile.org/~tskirvin/software/news-archive/

=head1 COPYRIGHT

Copyright 2003-2006, Tim Skirvin <tskirvin@killfile.org>

=cut

###############################################################################
### Version History ###########################################################
###############################################################################
# 0.10          Fri 30 Jun 11:40:44 CDT 2006    tskirvin
### Initial version, based off of mbox2news.pl and some other code.  Not 
### really documented worth anything.  Oh well.
