#!/usr/bin/perl -w
# bmconv1.pl - Convert HTML entities to ISO 8859-1 (Latin 1)
# (c) Marco Vieth, 2006
#
# 0.01  25.02.2006 first tests
#
#
  # Script version. This is assigned before the 'use strict' line below on
  # purpose: strict is lexically scoped from its 'use' statement onwards, so
  # the undeclared package variable $VERSION is still permitted here.
  $VERSION = '0.01';
  # Minimum Perl version required to run this script.
  use 5.004;
  # From here on all variables must be declared.
  use strict;

  #use Getopt::Std ();

#############

# Charmap of named HTML entities for ISO 8859-1 (since HTML 3.2).
# Maps each Latin-1 character 0xA1..0xFF to its named entity. The keys are
# written as "\xNN" escapes so the table does not depend on the encoding in
# which this source file is stored or edited (literal high-bit characters are
# easily lost or mangled by transcoding). 0xA0 (&nbsp;) is not part of the
# table.
my %charmap = (
  "\xA1" => '&iexcl;',
  "\xA2" => '&cent;',
  "\xA3" => '&pound;',
  "\xA4" => '&curren;',
  "\xA5" => '&yen;',
  "\xA6" => '&brvbar;',
  "\xA7" => '&sect;',
  "\xA8" => '&uml;',
  "\xA9" => '&copy;',
  "\xAA" => '&ordf;',
  "\xAB" => '&laquo;',
  "\xAC" => '&not;',
  "\xAD" => '&shy;',
  "\xAE" => '&reg;',
  "\xAF" => '&macr;',
  "\xB0" => '&deg;',
  "\xB1" => '&plusmn;',
  "\xB2" => '&sup2;',
  "\xB3" => '&sup3;',
  "\xB4" => '&acute;',
  "\xB5" => '&micro;',
  "\xB6" => '&para;',
  "\xB7" => '&middot;',
  "\xB8" => '&cedil;',
  "\xB9" => '&sup1;',
  "\xBA" => '&ordm;',
  "\xBB" => '&raquo;',
  "\xBC" => '&frac14;',
  "\xBD" => '&frac12;',
  "\xBE" => '&frac34;',
  "\xBF" => '&iquest;',
  "\xC0" => '&Agrave;',
  "\xC1" => '&Aacute;',
  "\xC2" => '&Acirc;',
  "\xC3" => '&Atilde;',
  "\xC4" => '&Auml;',
  "\xC5" => '&Aring;',
  "\xC6" => '&AElig;',
  "\xC7" => '&Ccedil;',
  "\xC8" => '&Egrave;',
  "\xC9" => '&Eacute;',
  "\xCA" => '&Ecirc;',
  "\xCB" => '&Euml;',
  "\xCC" => '&Igrave;',
  "\xCD" => '&Iacute;',
  "\xCE" => '&Icirc;',
  "\xCF" => '&Iuml;',
  "\xD0" => '&ETH;',
  "\xD1" => '&Ntilde;',
  "\xD2" => '&Ograve;',
  "\xD3" => '&Oacute;',
  "\xD4" => '&Ocirc;',
  "\xD5" => '&Otilde;',
  "\xD6" => '&Ouml;',
  "\xD7" => '&times;',
  "\xD8" => '&Oslash;',
  "\xD9" => '&Ugrave;',
  "\xDA" => '&Uacute;',
  "\xDB" => '&Ucirc;',
  "\xDC" => '&Uuml;',
  "\xDD" => '&Yacute;',
  "\xDE" => '&THORN;',
  "\xDF" => '&szlig;',
  "\xE0" => '&agrave;',
  "\xE1" => '&aacute;',
  "\xE2" => '&acirc;',
  "\xE3" => '&atilde;',
  "\xE4" => '&auml;',
  "\xE5" => '&aring;',
  "\xE6" => '&aelig;',
  "\xE7" => '&ccedil;',
  "\xE8" => '&egrave;',
  "\xE9" => '&eacute;',
  "\xEA" => '&ecirc;',
  "\xEB" => '&euml;',
  "\xEC" => '&igrave;',
  "\xED" => '&iacute;',
  "\xEE" => '&icirc;',
  "\xEF" => '&iuml;',
  "\xF0" => '&eth;',
  "\xF1" => '&ntilde;',
  "\xF2" => '&ograve;',
  "\xF3" => '&oacute;',
  "\xF4" => '&ocirc;',
  "\xF5" => '&otilde;',
  "\xF6" => '&ouml;',
  "\xF7" => '&divide;',
  "\xF8" => '&oslash;',
  "\xF9" => '&ugrave;',
  "\xFA" => '&uacute;',
  "\xFB" => '&ucirc;',
  "\xFC" => '&uuml;',
  "\xFD" => '&yacute;',
  "\xFE" => '&thorn;',
  "\xFF" => '&yuml;',
);


# Currently unused: the opposite direction of do_convert_html_iso1().
# Copies the input read via <> to standard output, replacing every character
# that is a key of %charmap with its named HTML entity (not needed for
# HTML 4.0). Always returns 1.
sub do_convert_iso_html1() {
  # All mapped characters concatenated; used below as a character class body.
  my $class_members = join('', keys %charmap);

  while (defined(my $line = <>)) {
    $line =~ s/([$class_members])/$charmap{$1}/g;
    print $line;
  }

  return 1;
}

# Copies the input read via <> to standard output, replacing every named
# HTML entity listed in %charmap with the character it stands for.
# Always returns 1.
sub do_convert_html_iso1() {
  # Invert %charmap (entity => character). keys and values of an unmodified
  # hash are returned in matching order, so one slice assignment does it.
  my %char_for;
  @char_for{values %charmap} = keys %charmap;

  # A single alternation covering every known entity.
  my $entity_alternation = join('|', keys %char_for);

  # Optional extra replacements (disabled):
  #my %misc_char_for = (
  #  '&lsquo;' => "'",
  #  '&rsquo;' => "'",
  #);
  #my $misc_alternation = join('|', keys %misc_char_for);

  while (defined(my $line = <>)) {
    # convert HTML entities to latin-1
    $line =~ s/($entity_alternation)/$char_for{$1}/g;

    #$line =~ s/($misc_alternation)/$misc_char_for{$1}/g; # convert some specialities (optional)

    #$line =~ s/(?:(\w+) ([:?!]))/$1$2/g; # remove space before : ? !
    print $line;
  }

  return 1;
}



#
# main
#
# Runs the HTML-entity -> Latin-1 conversion and translates its result into
# a process exit status: 0 if the conversion returned a true value, 1 if not.
sub main() {
  # Command-line option handling (disabled):
  #my %opts = (
  #);
  #if (!Getopt::Std::getopts("hd:", \%opts) or (@ARGV == 0) or exists($opts{'h'})) {
  #  require File::Basename;  # load dynamically for help
  #  print STDERR "Usage: ". File::Basename::basename($0) ." [options] <destination dir>\n";
  #  print STDERR "-h       : help\n";
  #  print STDERR "-d level : debug level (0=off, 1=normal, >1=extended)\n";
  #  print STDERR "\n";
  #  exit 1;
  #}

  return do_convert_html_iso1() ? 0 : 1;
}

# Hand main()'s status back to the calling process.
exit(main());

__END__
