#!/usr/local/bin/perl
#
#  Perl5 + p5-HTML-Parser 専用
#  ul li /li /ul を処理して、閉じタグの省略をおぎなう。
#  ol li /li /ol を処理して、閉じタグの省略をおぎなう。
#  dl dt /dt dd /dd /dlを処理して、閉じタグの省略をおぎなう。
#
#  2001.11.02 first version
#  2001.11.09 ul / ol / dl のハードコーディングから、リスト形式のデータに変更。
#
#  $Id: html-tag-ul3,v 1.2 2001/11/09 08:07:01 george Exp $
#


use HTML::Parser();		# use declare

#
#  subclassing and method callbacks for HTML::Parser
#
{
  package MyParser;
  use base 'HTML::Parser';

  my $dbg = 0;			# デバッグ用フラグ

  #  処理対象タグリスト指定
  #  各リストの先頭1要素はブロック開始/終了タグ。
  #  各リストの先頭2要素以降は内部用タグ。
  @proc_tag_list = (
		  [ "ul" , "li" ] ,
		  [ "ol" , "li" ] ,
		  [ "dl" , "dt" , "dd"]
		  );

  my $outbuf;
  my @tag_stack;

  sub bufprint {
    my $s;

    print "bufprint::start\n" if ($dbg > 5);
    foreach $s (@_) {
      $outbuf .= $s;
      print "bufprint::" . $s . "\n" if ($dbg > 5);
    }
    print "bufprint:: outbuf string is " . $outbuf . "\n" if ($dbg > 5);
  }

  sub bufflush {
    print "bufflush::start\n" if ($dbg);
    print $outbuf;
    $outbuf = "";
  }

  sub tag_stack_to_string {
    my $aryref;

    print "#tag_stack dump start.\n";
    foreach $aryref (@tag_stack) {
      foreach $s (@$aryref) {
	print $s . " ";
      }
      print "\n";
    }
    print "#tag_stack dump end.\n";
  }

  sub bufappend {
    my $s;
    my $right_spc;

    print "bufappend::start\n" if ($dbg);
    # 右側の改行やスペースなどを分離する。
    $right_spc = "";
    if ( $outbuf =~ /(.*?)([\f\t\s\r\n]+)$/s ) {
      $outbuf = $1;
      $right_spc = $2;
      print "bufappend:: left = :" . $outbuf . ": right = :" . $right_spc . ":\n" if ($dbg);
    }
    foreach $s (@_) { $outbuf .= $s; }
    $outbuf .= $right_spc;
    print "bufappend:: final outbuf = :" . $outbuf . ":\n" if ($dbg);
  }

  sub match_block_tag {
    my ($s) = @_;
    my $iref;
    foreach $iref (@proc_tag_list) {
      if ( $s eq $$iref[0] ) {
	return 1;
      }
    }
    return 0;
  }

  sub match_inner_tag {
    my ($s) = @_;
    my $iref;
    my $irefin;
    foreach $iref (@proc_tag_list) {
      for ( $i = 1 ; $i <= $#$iref ; $i++ ) {
	if ( $s eq $$iref[$i] ) {
	  return 1;
	}
      }
    }
    return 0;
  }

  sub check_inner_tag {
    my ($intag, $blocktag) = @_;
    my $iref;
    my $irefin;
    foreach $iref (@proc_tag_list) {
      for ( $i = 1 ; $i <= $#$iref ; $i++ ) {
	if ( $intag eq $$iref[$i] ) {
	  if ( $blocktag eq $$iref[0] ) {
	    return 1;
	  } else {
	    return 0;
	  }
	}
      }
    }
    return 0;
  }

  sub start {
    my ($self, $tagname, $attr, $attrseq, $origtext, $line) = @_;

    print "start:: current tag is " . $tagname . "\n" if ($dbg);
    if ( match_block_tag($tagname) ) {
      push @tag_stack, [ $tagname ];
    } elsif ( match_inner_tag($tagname) ) {
      $aryref = pop @tag_stack;
      check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort.";
      if ($$aryref[$#$aryref] eq $tagname) {
	bufappend "</" . $tagname . ">";
	pop @$aryref;
      }
      push @$aryref, $tagname;
      push @tag_stack, $aryref;
    }

    if ( $dbg ) {
      print "===== start:: current tag stack is =====\n";
      tag_stack_to_string;
    }

    # print out
    bufprint '<' . $tagname;
    foreach $key (@$attrseq) {
      bufprint ' ' . $key . '=' . '"' . $$attr{$key} . '"';
    }
    bufprint '>';
  }

  sub end {
    my($self, $tagname, $origtext, $line) = @_;
    my $aryref;
    my $s;

    print "end:: current tag is " . $tagname . "\n" if ($dbg);
    if ( match_block_tag ($tagname) ) {
      $aryref = pop @tag_stack;
      if ( $$aryref[0] ne $tagname ) {
	die "nest error $tagname in line $line. abort.";
      }
      while ( $#$aryref > 0 ) {
	$s = pop @$aryref;
	bufappend '</' . $s . '>';
      }
    } elsif ( match_inner_tag($tagname) ) {
      $aryref = pop @tag_stack;
      check_inner_tag($tagname, $$aryref[0]) || die "nest error $tagname in line $line. abort.";
      if ($$aryref[$#$aryref] eq $tagname) {
	  pop @$aryref;
      }
      push @tag_stack, $aryref;
    }

    if ( $dbg ) {
      print "===== end:: current tag stack is =====\n";
      tag_stack_to_string;
    }

    # print out
    bufprint '</' . $tagname . '>';

    # ネスト関係が解消されていたら、早めにバッファをフラッシュ
    if ( $#tag_stack == 0 ) {
      bufflush;
    }
  }

  sub text {
    my($self, $origtext, $is_cdata) = @_;

    bufprint $origtext;
  }

  sub declaration {
    my($self, $origtext) = @_;

    bufprint $origtext;
  }

  sub comment {
    my($self, $origtext) = @_;

    bufprint $origtext;
  }

  sub process {
    my($self, $origtext) = @_;

    bufprint $origtext;
  }

  sub start_document {
  }

  sub end_document {
    bufflush;
  }

  sub default {
    my($self, $origtext) = @_;

    bufprint $origtext;
  }
}

$p = MyParser->new;
$p->handler(start          => "start",          "self,tagname,attr,attrseq,text,line");
$p->handler(end            => "end",            "self,tagname,text,line");
$p->handler(text           => "text",           "self,text,is_cdata");
$p->handler(process        => "process",        "self,token0,text");

$p->handler(comment        => "comment",        'self,text');
$p->handler(declaration    => "declaration",    'self,text');
$p->handler(process        => "process",        'self,text');
$p->handler(default        => "default",        'self,text');
$p->handler(start_document => "start_document", 'self');
$p->handler(end_document   => "end_document",   'self');

#
#  here we go!
#
$p->parse_file("foo.html");

#
#  end of file
#