#! /usr/bin/perl
#
# mkindex - Rebuild index file for HTML files in the Internet Encyclopedia
#
# "Connected: An Internet Encyclopedia" has an index file in its root
# directory containing entries for every HTML file, link, header item
# and emphasized text.  This file serves several functions.  First,
# it provides a way to verify the correctness of every link.
# Second, it allows for the creation of list_of_children files,
# which are basically hypertext outlines of HTML directory hierarchies.
# Finally, it provides a basis for constructing simple UNIX scripts
# to examine and manipulate the HTML interrelations.  I also hope to use it
# to enhance the performance of the search engine.
#
# This script maintains that file.  From any point in the Connected
# directory tree, you can run "mkindex" to rebuild the index file.
# This should be done regularly as files are changed.  Since the
# old file is only replaced once the entire script has run,
# interrupts or fatal errors will not corrupt the index.
#
# The script will traverse the entire Connected directory structure,
# and compare the modification time on each file with the modification
# time of the index.  Entries for unchanged files will be copied from
# the old index file to the new one.  For each changed file, the
# "htmlcheck" script will be run, which checks the file for conformance
# with encyclopedia formatting guidelines, complains to stderr about
# problems, and prints index entries to stdout, which this script redirects
# into the index file.
#
# When the entire directory tree has been analyzed, the old copy of the
# index file is moved to a backup location, and the new copy replaces
# it.  Summary information is printed to stdout, listing the number
# of index file passes (explained below), the number of changed files
# (for which htmlcheck was run), and the total number of HTML files
# in the encyclopedia.  A diff is run, comparing the old index
# file to the new one.  Finally, "mkoutlines" is run twice to
# rebuild the outline files in the Topical Core and Project Documentation.
#
# As the script checks files in the directory structure, it copies
# entries for unchanged files from the old index file to the new one.
# For each such file, the script advances through the old index
# file, looking for a group of lines corresponding to the file
# in question.  It skips any lines that don't match, thus silently
# discarding old entries for deleted or changed files.  Remember,
# all the entries for changed files come from "htmlcheck".
# Since the ordering of files shouldn't change, normally only
# one pass is needed.  After all, if file A was processed before
# file B last time, it will probably be processed first again this
# time - especially since we sort the file lists.  Of course, it is
# possible that something might get out of sync.  If this happens, the
# script will read all the way to the end of the old file without
# finding any matching entries.  In this case, a warning message
# is printed, the old index file is rewound, "htmlcheck" is run
# on the file in question (just for the hell of it), and the
# algorithm starts again, at the top of the old index file,
# for the next HTML file to be processed.  This is the meaning
# of the "index file passes" statistic - the number of times
# this script read through the old index file.  It should normally
# be one, but don't be too concerned if it is a small number.
#
# This script seems pretty specific to the Internet Encyclopedia,
# but you never know - maybe the ideas will prove useful.
#
#        CHANGES TO BE CONSIDERED
#
# - Make the index file an NDBM database.  This is probably a prerequisite
#   for using it in the search script.
#
# - Embed this functionality deeper in the file system.  It would be really
#   nice to be able to run a program every time a file is changed,
#   and have that reliably enforced by the operating system.
#


# $rootdir is a UNIX path leading to the Encyclopedia root directory.
# Derive it from the current working directory, which must lie somewhere
# inside a ".../Connected" hierarchy.

(($rootdir) = (`pwd` =~ m:^(.*/Connected):))
    || die "This script must be run from within the Connected hierarchy\n";

# do_file($file) - emit index entries for one HTML file.
#
# If $file is newer than the old index ($basetime), run "htmlcheck" on
# it and let its output flow to NEWIDX (our stdout).  Otherwise, scan
# forward through OLDIDX for the file's existing entries and copy them
# through unchanged.  Updates the global counters $files, $newfiles and
# $passes.  Returns 1 normally, 0 if the file was unreadable.

sub do_file {
    local($file) = @_;

    ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
     $atime,$mtime,$ctime,$blksize,$blocks) = stat($file);

    $files ++;

    if (! -r $file) {
	print STDERR "Warning: $file is unreadable, and has been omitted\n";
	return 0;
    }

    if ($mtime > $basetime) {
	# The file has been modified more recently than OLDIDX
	# Therefore, run htmlcheck on it, the output going to NEWIDX (stdout)

	(system "htmlcheck $file | sed 's:^:/Connected/:'")/256 == 0
	    || die "Fatal: can't run htmlcheck | sed pipeline!\n";
	$newfiles ++;
    } else {
	# The file has not been modified more recently than OLDIDX
	# Therefore, scan through OLDIDX looking for its entries, and
	# copy them to NEWIDX (stdout).  \Q...\E quotes regex
	# metacharacters in $file - every HTML filename contains ".",
	# which would otherwise match any character.

	while ((! m:^/Connected/\Q$file\E\t:) && ($_ = <OLDIDX>)) {}
	if (! defined($_)) {
	    # We read clear through OLDIDX without finding a match.
	    # Note: we test definedness of $_, not eof(OLDIDX) - eof()
	    # is also true when the matching entry is the very last
	    # line of the old index, which would falsely trigger a
	    # rewind here.
	    $passes ++;
	    print STDERR "Can't find $file - rewinding OLDIDX\n";
	    seek(OLDIDX,0,0) || die "Can't rewind OLDIDX";
	    $_ = <OLDIDX>;

	    (system "htmlcheck $file | sed 's:^:/Connected/:'")/256 == 0
		|| die "Fatal: can't run htmlcheck | sed pipeline!\n";
	}
	else {
	    # Copy every consecutive entry belonging to this file.
	    while (m:^/Connected/\Q$file\E\t:) {
		print;
		$_ = <OLDIDX>;
	    }
	}
    }
    return 1;
}

# Handle symlinks
#
# This routine could probably use a lot more error checking, as
# symlinks could go, well, conceivably anywhere.  I just assume
# (there's that word again) that the link points to a nice, valid
# HTML directory or file in our web hierarchy.

# do_symlink($src) - emit a "SymLink" index entry for the symbolic link
# $src, resolving its (possibly relative) target to a path rooted at
# the hierarchy top.  Warns to stderr if the target looks broken.

sub do_symlink {
    local($src) = @_;
    local($target1, $target2, $target);

    $target1 = $src;
    $target2 = readlink($src);

    # $target1 -> $target2, and they look something like
    # a/b/link -> file.html
    #
    # We want to get them so we can concat them together,
    # so the first step is to strip "link" - the name of the link

    $target1 =~ s:[^/]*$::;

    # Now we've got             a/b/ -> file.html
    # Of course, we could have  a/b/ -> ../file.html
    # so let's loop around and strip all parent references.
    # The dots must be escaped: an unescaped "^../" would treat ANY
    # two leading characters before a slash (e.g. "ab/") as a parent
    # reference and mangle the resolved path.

    while ($target2 =~ m:^\.\./:) {
	$target1 =~ s:[^/]*/$::;
	$target2 =~ s:^\.\./::;
    }

    # Finally, we can combine them to get the target name,
    # relative to the root of the hierarchy

    $target = $target1 . $target2;

    # If the target name is a directory, find its index file and
    # attach its name to the end of $target.  This isn't absolutely
    # necessary, but I like to have only one name for each web page.

    if (-d $target) {
	if (-r "$target/index.shtml") {
	    $target .= "/index.shtml";
	} elsif (-r "$target/index.html") {
	    $target .= "/index.html";
	} else {
	    print STDERR "Warning: SymLink $src points to $target, a directory without index files\n";
	}
    } elsif (! -r $target) {
	print STDERR "Warning: SymLink $src points to $target, an apparently non-existant file\n";
    }

    print "/Connected/$src	SymLink	/Connected/$target\n";
}

# do_directory($directory) - recursively index one directory.
#
# Indexes the directory's own index.shtml/index.html first (warning if
# both or neither exist), then walks its sorted contents: HTML files go
# to &do_file, symlinks to &do_symlink, and subdirectories (except bin,
# cgi-bin and RCS) recurse through &do_directory.  Pass "" for the
# hierarchy root.

sub do_directory {
    local($directory) = @_;
    local($i, @directory_files);

    # Normalize to a trailing slash so "${directory}name" concatenates.
    if ($directory ne "" && $directory !~ m:/$:) {
        $directory .= "/";
    }

    if (-r "${directory}index.shtml" && -r "${directory}index.html") {
	print STDERR "Warning: ${directory} has both index.shtml and index.html; using index.shtml\n";
	&do_file("${directory}index.shtml");
    } elsif (-r "${directory}index.shtml") {
	&do_file("${directory}index.shtml");
    } elsif (-r "${directory}index.html") {
	&do_file("${directory}index.html");
    } else {
	print STDERR "Warning: ${directory} has neither index.shtml nor index.html\n";
    }

    @directory_files = sort <${directory}*>;
    foreach $i (@directory_files) {
	# Dots escaped: /index.s?html$/ would otherwise match names
	# like "myindexXshtml" via the regex wildcard.
        if ($i =~ /\.s?html$/ &&
	    $i !~ /index\.s?html$/ && $i !~ /Template\.s?html$/) {
	    &do_file($i);
	} elsif (-l $i) {
	    &do_symlink($i);
	} elsif (-d $i && ($i !~ m:^bin$:) && ($i !~ m:^cgi-bin$:) &&
		 ($i !~ m:/RCS$:) && ($i !~ m:^RCS$:)) {
	    &do_directory($i);
	}
    }
}

# Wherever we are, change to the rootdir of the encyclopedia, and
# run through it from there.
#
# Note the parens on chdir: "chdir $rootdir || die ..." would bind the
# || to $rootdir (which is always true), so the die could never fire.

chdir($rootdir) || die "Can't cd to encyclopedia rootdir - $rootdir\n";

# $basetime is the modification time of the old index; files newer
# than this get re-checked by htmlcheck.

($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
 $atime,$mtime,$ctime,$blksize,$blocks) = stat("index");
$basetime = $mtime;

$idxold = "index";
$idxnew = "index.new";
$idxbak = "index.bak";

# NEWIDX is dup'ed onto STDOUT so that both our print statements and
# the htmlcheck pipelines land in the new index file.

open(OLDIDX, "<$idxold")     || die "Can't open $idxold";
open(NEWIDX, ">$idxnew")     || die "Can't open $idxnew";
open(STDOUT, ">&NEWIDX")     || die "Can't dup NEWIDX to STDOUT";
select(STDOUT); $| = 1;
select(NEWIDX); $| = 1;

$passes = 1;
$newfiles = 0;
$files = 0;

# Prime $_ with the first old-index line, then walk the whole tree.

$_ = <OLDIDX>;
&do_directory("");

close(OLDIDX);
close(NEWIDX);

# Rotate: old index -> backup, new index -> live.  Only replace the
# live index if the backup move succeeded, so a failure can't lose data.

if ((system "mv $idxold $idxbak")/256 == 0) {
    (system "mv $idxnew $idxold")/256 == 0
	|| print STDERR "Warning: can't mv $idxnew $idxold!!\n";
} else {
    print STDERR "Warning: can't mv $idxold $idxbak; leaving new data in $idxnew\n";
}

printf STDERR "Index file passes:      %2d\n", $passes;
printf STDERR "New/changed files:      %2d\n", $newfiles;
printf STDERR "Total files:            %2d\n", $files;
printf STDERR "\n";

system "diff $idxbak $idxold 1>&2";

# Rebuild the outline files.  system() returns 0 on success, so the
# exit status must be tested explicitly - a bare "system ... || die"
# would never (or always) die.  Each chdir is checked so we never run
# mkoutlines in the wrong directory.

print STDERR "\nRebuilding outlines in the Topical Core...";
chdir("$rootdir/Topics") || die "Can't cd to $rootdir/Topics\n";
(system "mkoutlines")/256 == 0
    || die "Can't run mkoutlines!\n";

print STDERR "\nRebuilding outlines in Project Documentation...";
chdir("$rootdir/Project") || die "Can't cd to $rootdir/Project\n";
(system "mkoutlines")/256 == 0
    || die "Can't run mkoutlines!\n";

print STDERR "\nRebuilding outlines in Programmed Instruction Course...";
chdir("$rootdir/Course") || die "Can't cd to $rootdir/Course\n";
(system "mkoutlines")/256 == 0
    || die "Can't run mkoutlines!\n";

print STDERR "\n";