#!/usr/bin/perl -w use Getopt::Long; use File::Basename; use strict; use warnings; my($me, $mepath, $mesuffix) = fileparse($0, '\.pl'); #-------------------------------------------------------------- # Initialization - Sort out the options and parameters #-------------------------------------------------------------- my %opts = (taxonomy => ""); Getopt::Long::GetOptions( \%opts,qw( help taxonomy=s )) || die "Failed to parse options\n"; # Simple help if requested if ($opts{help}) { print STDERR "$me [options]\n" . "Remove HTML from a file in preparation for sending to Drual site.\n"; #if ($opts{help}) { system("perldoc $0"); } exit 1; } #-------------------------------------------------------------- # Read in all the HTML files, clean them up #-------------------------------------------------------------- my $w = 0; foreach my $f (@ARGV) { open(IN,$f) || die $!; my $tline = ''; # Try to find a title line my $newsletter = 0; # Strip header while() { if (/
(Volume \d+[^<]+) found. Maybe this file has no HTML headers. Retrying\n"; close(IN); open(IN,$f) || die $!; } # Strip font crap, go until body my @h = (); while(my $l = ) { $l =~ s/\r//; if ($l =~ /^(.*)]+>(.*)/) { # Remove any
$l = $1 .' ' . $2 . "\n"; if ($l eq " \n") { next; } } if ($l =~ /<\/body>/) { last; } # Remove misuses of various things $l =~ s///g; $l =~ s/<\/o:p>//g; $l =~ s/<\/span>//g; $l =~ s/<\/font>//g; $l =~ s/’/'/g; $l =~ s/“/'/g; $l =~ s/”/'/g; $l =~ s/—//g; $l =~ s/–/,/g; $l =~ s/’/'/g; $l =~ s/“/"/g; $l = rmfont($l); $l = rmspan($l); if ($l =~ /(.*)

]+>(.*)/) { $l = $1 . '

' . $2 . "\n"; } if ($l eq " \n") { next; } # Try to guess the title if (! $tline) { if ($l =~ /(.+)<\/strong>/) { $tline = $1; } elsif ($l =~ /

(.+)<\/h3>/) { $tline = $1; } } push @h,$l; } close(IN); # If the title was taken from the first line, remove it from the HTML if ($tline && $h[0] =~ /^

.+$tline.*<\/p>$/) { $_ = shift @h; } # Write the new HTML into a stripped file $f .= '.stripped'; open(OUT,'>' . $f) || die "Unable to create file '$f'\n"; print OUT @h; close(OUT); printf("file=%-30s; title=%-30s; taxonomy=%s;\n", $f, $tline, $opts{taxonomy}); if (! $tline) { warn "Unable to parse out title line in '$f'\n"; $w++; } } exit $w; sub rmfont { my ($s) = @_; for my $i (0,1,2,3) { if ($s !~ /(.+)]+>(.*)/) { return $s; } $s = $1 . $2 . "\n"; } return $s; } sub rmspan { my ($s) = @_; for my $i (0,1,2,3) { if ($s !~ /]+>(.*)/) { $s = $1 . $2 . "\n"; } } return $s; }