#!/usr/bin/perl -w

# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

use strict;
use warnings;

sub mywarn;

use Getopt::Long qw(:config bundling auto_help);
use Pod::Usage;
our ($opt_noannotate, $opt_h, $opt_ids);
GetOptions("noannotate" => \$opt_noannotate,
           "ids" => \$opt_ids,
	   "h|headers" => \$opt_h
	  );

=head1 NAME

mboxget - Extract message(s) from an mbox (or other mail file)

=head1 SYNOPSIS

 mboxget [options] <target> ...

 Options:
  -h, --headers       Display only message headers
  --ids               List only the IDs, do not generate an mbox
  --noannotate        Do not add X-Mass-Check-Id header with message path

=head1 DESCRIPTION

B<mboxget> extracts and displays the messages specified on the command
line and on STDIN. Messages are expected to be listed as
<path_to_file>.<offeset> or simply <path_to_file>. Multiple messages
can be specified, in which case, each message will be output. This
script is very useful in combination with mass-check logs and
mboxes. Targets given on STDIN may be in B<mass-check> format, and may
even be preceeded by a filename and a colon (like the output of
B<grep> would provide)

Use the B<-h> option to just output headers.

By default an X-Mass-Check-Id header is added to make it easier to
find a message. Use B<--noannotate> to disable this.

=head1 EXAMPLES

To show spam messages that hit the rule BAYES_99

grep BAYES_99 masses.log | mboxget

To show the message indicated by "/path/to/my/mbox.1234"

mboxget /path/to/my/mbox.1234

To find the Subject lines of all spam messages that hit SUBJ_AS_SEEN

grep SUBJ_AS_SEEN spam.log | mboxget -h | grep ^Subject:

To get the ids or paths of messages that hit SUBJ_AS_SEEN

grep SUBJ_AS_SEEN spam.log | mboxget --ids

=cut



my $prog = $0;
$prog =~ s@.*/@@;

my $annotate = ($opt_noannotate ? 0 : 1);

foreach (@ARGV) {
  handle_input($_);
}

while (<>) {
  s/^[^\s:]+://;  # filenames, from "grep foo *"

  if (/^[Y\.]\s+-?\d+\s+(\S+)\s+\S+/) {
    # mass-check format: . 0 /home/jm/Mail/Spam2/3475 FOO_BAR,BAZ
    handle_input($1);
  }
  elsif (/^[Y\.]\s+-?\d+\s+(\S+)\s*$/) {
    # mass-check format: . 0 /home/jm/Mail/Spam2/3475
    handle_input($1);
  }
  else {
    next if /^#/;
    chomp;
    handle_input($_);
  }
}

exit;

sub handle_input {
  my $where = shift;

  if ($opt_ids) {
    print "$where\n";
    return;
  }

  my ($file, $offset);

  if (-f $where) {
    ($file, $offset) = ($where);
  } else {
    ($file, $offset) = ($where =~ m/(.*?)(?:\.(\d+))?$/);
  }

  if ($file =~ /\.gz$/) {
    open (INPUT, "gunzip -cd $file |") or mywarn "gunzip $file failed: $!";
  } elsif ($file =~ /\.bz2$/) {
    open (INPUT, "bzip2 -cd $file |") or mywarn "bunzip2 $file failed: $!";
  } else {
    open (INPUT, "<$file") or mywarn "open $file failed: $!";
  }

  if ($offset) {
    # TODO: steal open-file caching code from old revisions of
    # mass-check-results-to-mbox
    if (!seek(INPUT, $offset, 0)) {
      mywarn "$prog: seek $offset failed: $!\n";
      close INPUT;
      return;
    }
  }

  # read the message into @msg
  my $past = 0;
  my @msg = ();
  while (<INPUT>) {
    if ($past && defined($offset)) {
      # only do this for mboxes
      last if substr($_,0,5) eq "From ";
    }
    else {
      $past = 1;
    }
    if ($opt_h) {
      last if /^$/;
    }
    push (@msg, $_);
  }
  close INPUT;

  # now chop off the leading headers that may have come from a previous
  # run, or will interfere with insertion of the X-Mass-Check-Id hdr
  my $fromline = "From nobody\@nowhere  Wed Jan  1 00:00:00 2000\n";
  while (scalar @msg > 0 &&
      $msg[0] =~ /^(?:From|X-Mass-Check-Id:) /)
  {
    if ($msg[0] =~ /^From /) { $fromline = $msg[0]; }
    shift @msg;
  }

  # and output
  $annotate and unshift (@msg, "X-Mass-Check-Id: $where\n");
  print $fromline, @msg, "\n";
}

sub mywarn {
  warn @_;
  if ($annotate) { print "X-Mass-Check-Warning: ".join ('',@_)."\n"; }
}

