#!/usr/bin/perl 

use Getopt::Std;

#  Routine to generate an index file from PDS labels by extracting keywords 
#  from the labels as specified by the input.
#
#  15 Mar 1995, acr.
#  21 Apr 1995, acr: shaking out the bugs
#  17 May 1995, acr: made ".LBL" extension check case-insensitive
#  19 Jun 1995, acr: fixed file name case bug in 'runlabels'
#  13 Jul 1995, acr: REALLY made ".LBL" extension check case-insensitive;
#                    fixed bug which kept negative widths from left-justifying
#  13 Nov 1995, acr: fixed leftover bug with last keyword width being negative
#  29 Apr 1996, acr: fixed bug that caused keywords inside objects to be ignored
#  30 May 1996, acr: made %val array local to &process_label to fix bug in 
#                    which old values of keywords missing in subsequent labels
#                    were repeated
#  08 Aug 1996, acr: Added some comments, removed some debug flags
#  04 Mar 1997, acr: Fixed processing of object numbering to restart numbering
#                    in each label
#  19 May 1997, acr: beefed-up label line processing so it wouldn't choke on
#                    '=' not surrounded by space, among other things; tweaked 
#                    ^PATH handling a bit so that relative ("../" etc.) paths
#                    and the trailing slash are clipped.
#  02 Feb 1999, acr: Removed gratuitous blank written onto the end of the last
#                    field in each line.
#  06 Mar 2003, acr: Added FILE_SPECIFICATION_NAME handling
#  01 Oct 2003, acr: Fixed bug that wouldn't allow negative widths (for left-
#                    justifying fields) from the command line.
#  06 Dec 2005, acr: Modified to handle namespace IDs (local data dictionary
#                    support; modified default fields list to coincide with
#                    current INDEX_TABLE requirements; added default value
#                    handling; use the now-standard Getopts package
#
#  Input options:
#
#     -a            Check all files for attached labels
#     -n            Do not use default keywords
#     -r            Descend recursively through subdirectories
#     -o <file>     Send output to <file>
#     -f <file>     Read keywords from <file>
#     -d <dir>      Start at directory <dir>
#     -v            Include "VOLUME_ID" in default list.
#
#
#  Define the list of valid special fields as an associative array for easy
#  searching:

%SPECIAL = ('^PATH', 1, '^FILE', 1, '^LABEL', 1,
            '^OFFSET', 1, '^OBJECT', 1, '^FILE_SPECIFICATION_NAME', 1);

#
#  Start by dealing with the command line arguments and collecting the various
#  flags, fields, and file names.  Getopts collects the options and their
#  arguments, adjusting the ARGV array accordingly.
#  

getopts('anro:f:d:v');


#
#  Formatting and keyword selection are taken care of through two arrays:
#  one associative, the other a straight array.  The associative array is
#  indexed on keyword name (including path) and returns the number of bytes
#  for the output field width.  The second contains a list of the keywords
#  to be indexed, in the order in which they appeared on the command line
#  and/or in the field list file (default fields come first).  This is used
#  to output the fields in order.  A third array is used to hold any literal
#  values provided for use when the indicated element is not present in a 
#  label.
#
#
#  If the "-n" switch was used, then no default fields are defined.  
#  Otherwise, start with the default field list:
#

if (!$opt_n)
  { if ($opt_v)  # Include VOLUME_ID
      { $num_fields = 5;
        %size = ( '^FILE_SPECIFICATION_NAME', -50, 
                  'PRODUCT_ID',               -30,
                  'DATA_SET_ID',              -40,
                  'VOLUME_ID',                -11,
                  'PRODUCT_CREATION_TIME',     19 );
        @name = ( '^FILE_SPECIFICATION_NAME', 'PRODUCT_ID', 
                  'DATA_SET_ID', 'VOLUME_ID', 'PRODUCT_CREATION_TIME' );
        @literal = ( "", "", "", "NULL_0001", "");
      }
    else
      { $num_fields = 4;
        %size = ( '^FILE_SPECIFICATION_NAME', -50, 
                  'PRODUCT_ID',               -30,
                  'DATA_SET_ID',              -40,
                  'PRODUCT_CREATION_TIME',     19 );
        @name = ( '^FILE_SPECIFICATION_NAME', 'PRODUCT_ID', 
                  'DATA_SET_ID', 'PRODUCT_CREATION_TIME' );
        @literal = ( "", "", "", "" );
      }
  }

#
#  Look for an explicit output file and open it:
#

if ($opt_o)
  { open (OUTPUT,">$opt_o") || die "Could not open '$opt_o' for output.\n"; }

#
#  Next, check to see if a file of field descriptions was included.  If so,
#  read it:
#

if ($opt_f)
  { open (FIELDS,$opt_f)  ||  die "Could not open '$opt_f'\n";

    while ($line=<FIELDS>)
      { chop $line;
        if ( $line =~/^\s*$/  ||  $line =~ /^\s*#.*$/ )
          { next; }

        $line =~ s/#.*$//;                          # Delete trailing comments
        $line =~ s/\s//g;                           # Delete whitespace
        ($field,$width,$val) = split(/\//,$line);   # Split into fields
        if ($width eq "")                           # Insert default width
          { $width = ($val eq "")? 10 : length($val); }
        if ($width !~ /^(-|)[0-9]+$/)               # Not a number - assume lit.
          { $val = $width;
            $width = length($val);
          }
        $tmpsize[$num_fields] = $width;
        $name[$num_fields]    = $field;
        $literal[$num_fields] = $val;
        $num_fields++;
      }
    close(FIELDS);
  }

#
#  Finally, retrieve field names from the command line, if any:
#

for ($i=0; $i<=$#ARGV; $i++)     # Get field descriptions
  { ($fname, $fwidth, $val) = split(/\//,$ARGV[$i]);

    if ($fwidth eq "") 
      { $fwidth = ($val eq "")? 10 : length($val); }
    if ($fwidth !~ /^(-|)[0-9]+$/)
          { $val = $fwidth;
            $fwidth = length($val);
          }
    $tmpsize[$num_fields] = $fwidth;
    $name[$num_fields]    = $fname;
    $literal[$num_fields] = $val;
    $num_fields++;
  }

#
#  Format and do syntax-checking for all collected field names:
#

$num_fields = &checknames($num_fields,*name,*tmpsize);
if ($num_fields < 1)
  { printf STDOUT "pdsidx: no valid fields entered.\n";
    exit;
  }

#
#  And store output field sizes in an associative array:
#

for ($i=0; $i<$num_fields; $i++)
  { if ($tmpsize[$i] != 0)     #  Special Fields may have no size in the array
      { $size{$name[$i]} = $tmpsize[$i]; }
  }

## *** print keyword list ***
##for ($i=0; $i<$num_fields; $i++)
##  { printf "Keyword: %-50.50s  Width: %2d\n", $name[$i], $size{$name[$i]};
##  }

#
#  Determine the starting directory:
#

if ($opt_d)
  { if (-d $opt_d)        # Make sure it's a directory
      { $start_dir = $opt_d; }
    else
      { printf STDOUT "pdsidx: Invalid root directory (%s).\n", $opt_d;
        exit;
      }
  }
else
  { $start_dir = '.'; }

#
#  Now call the recursive routine which will process this and all
#  subdirectories, if requested:
#

&runlabels($start_dir,$opt_r);

#=============================================================================

sub checknames
  { local ($n,*fldnm,*width) = @_;
    local ($name,$i,$j,$k,$count,@tmpname,@tmpsize);
    local ($elm,$ldd);

    # First, run through the names, do syntax-checking and put in
    # uniform format:

    foreach $name (@fldnm)
      { # Delete blanks:
 
        $name =~ s/  *//g;

        # Force to upper case:

        $name =~ tr/a-z/A-Z/;

        # First, check for special fields (beginning with '^'):

        if ($name =~ /^\^\w+/)
          { if (!$SPECIAL{$name})     # Is it on the Special Field list?
              { printf STDOUT "pdsidx: Invalid field name, \"%s\" (%s).\n",
                              $name,"not a special field";
                $name = "";
              }
            next;    # No more checks for special names
          }

        # If there's a namespace, split if off, validate it and save it:

        if ($name =~ /:/)
          { ($ldd,$elm) = split(/:/,$name);
            if ($ldd !~ /^[A-Z][A-Z_0-9]*$/)
              { printf STDOUT "pdsidx: Invalid namespace, \"%s\" (%s).\n",
                              $ldd, "invalid character";
                $name = "";
                next;
              }
          }
        else
          { $elm = $name; 
            undef $ldd;
          }

        # Make sure there are only valid characters in the element name, and
        # that it begins with a letter:

        if ($elm !~ /^[A-Z][A-Z_0-9\-\[\]\.]*[^\.]$/)
          { printf STDOUT "pdsidx: Invalid field name, \"%s\" (%s).\n", 
                          $name, "invalid character";
            $name = "";
            next;
          }

        # Insert "[1]" before unnumbered objects:

        $elm =~ s/([A-Z]+)\./\1\[1\]./g;
        $name = ($ldd)? "$ldd:$elm" : $elm;

        # Check for equal numbers of []:

        if (split(/\[/,$name) != split(/\]/,$name))
          { printf STDOUT "pdsidx: Invalid field name, \"%s\" (%s).\n",
                          $name, "unbalanced brackets";
            $name = "";
            next;
          }

        # If there are brackets, make sure there are only digits 
        # between them:

        if ( ($name =~ /\[/)  &&  ($name =~ /\[\d*[^0-9\]]\d*\]/) )
          { printf STDOUT "pdsidx: Invalid field name, \"%s\" (%s).\n", 
                          $name, "invalid subscript";
            $name = "";
            next;
          }

      }

    #  Now, step through the lists and remove null field names:

    $count = 0;
    for ($i=0; $i<$n; $i++)
      { if ($fldnm[$i] ne "")
          { $tmpname[$count] = $fldnm[$i];
            $tmpsize[$count] = $width[$i];
            $count++;
          }
      }

    @fldnm = @tmpname;
    @width = @tmpsize;
    $count;

  }

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

sub runlabels

#
# Subroutine to run the loop through label files and subdirectories as needed
#
#   Arguments:  $_[0] = directory (unopened)
#               $_[1] = $opt_r    (recursion flag)
#
  { local ($dir=$_[0], $rflag=$_[1]);
    local (@label_file);                 # file names ending with ".lbl"
    local (@subdir);                     # subdirectory names
    local ($lf,$sd);                     # element counters for filename lists
    local ($file,$i);                    # loop/subscript variables
    local ($full_name);                  # full file name

    # Open the directory:

    opendir(DIR,$dir) || die "pdsidx: Could not open \"$dir\".\n";

    # Get a sorted list of files in the directory, and parse it into lists
    # of label file names and subdirectories:

    $lf = $sd = 0;
    foreach $file (sort readdir(DIR))
      { if ($file =~ /^\./)
          { next; }
        else
          { $file = $dir . '/' . $file; }
        $tfile = $file;
        $tfile =~ tr/a-z/A-Z/;
        if ($tfile =~ /\.LBL$/)
          { $label_file[$lf] = $file;
            $lf++;
          }
        elsif ($opt_a  &&  is_label($tfile))
          { $label_file[$lf] = $file;
            $lf++;
          }
        elsif (-d $file  &&  $rflag)
          { $subdir[$sd] = $file;
            $sd++;
          }
      }

    #  Now process the label files:

    foreach $file (@label_file)
      { open (LABEL,$file) || die "pdsidx: Could not open \"$file\".\n";
        &process_label($file);
      }

    # Close the current directory and descend through subdirectories:

    close(DIR);

    foreach $file (@subdir)
      { &runlabels($file,$rflag); }
  }

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    
sub process_label

#  Routine to read through the label file and pull out the desired keywords.
#
#  Arguments: $_[0]  = the full name of the file (already opened on LABEL)
#
  { local ($file=$_[0]);      # Full name of the file on LABEL
    local ($line);            # input buffer
    local ($buffer);          # string buffer
    local ($found_first) = 0; # flag for first object pointer
    local (%val);             # Array of found values
    local ($name);            # Object names for initialization loop
    local ($i);               # string subscript/index
    local ($prtval);          # Holding place for value to be printed

    # Before we can begin processing object names, we must initialize the
    # %object_count array.  For the very first label, this array will have 
    # no elements.  In subsequent labels, all existing elements should be 
    # re-initialized to zero, since object numbering is only relative to
    # other objects in the same label:

    foreach $name (keys %object_count)
      { $object_count{$name} = 0; }

    #  First, assign the special values for ^PATH, ^LABEL and 
    #  ^FILE_SPECIFICATION_NAME values:

    $buffer = $file;
    $buffer =~ s/\/[a-zA-Z0-9_.]*$//;  # Remove file name
    $buffer =~ s/^[\.\/]+//;           # Remove relative part of path name
    $val{'^PATH'} = $buffer;

    $buffer = $file;
    $buffer =~ s/^.*\///;
    $buffer =~ s/\.lbl//;
    $val{'^LABEL'} = $buffer;

    $val{'^FILE_SPECIFICATION_NAME'} = $val{'^PATH'}  . "/" .
                                       $val{'^LABEL'} . ".lbl";

    $object = "";
    while ($line=&nextline)
      { # Do some format cleanup to ease processing later:

        $line =~ s/^\s+//;         # Clip leading blanks
        $line =~ s/\s*=\s*/=/;     # Remove blanks around first '='

        # Split the line at the first '=':

        $i = index($line,"=");
        if ($i >= 0)
          { $keyword = substr($line,0,$i);
            $value   = substr($line,$i+1);
            $value   =~ s/^("|')//;
            $value   =~ s/("|')$//;
          }
        else
          { $keyword = $line;
            undef ($value);
          }

        $keyword =~ tr/a-z/A-Z/;          # Force keyword to upper case
        last if ($keyword eq "END");      # Check for end of label

        # Check for end of object and pop object name as needed.  Note that
        # we're assuming the label is well-formatted:

        if ($keyword eq "END_OBJECT")
          { $object =~ s/[^.]*\.$//;
            next;
          }

        # Any remaining one-word lines should be ignored as well:

        next unless ($value);


        #
        # OK, now we've got the keyword from the line.  Time to check for new
        # OBJECT definitions and adjust the $object variable accordingly;
        #

        if ($keyword eq "OBJECT")
          { $object = $object . $value;
#printf "   New object: $object\n";
            # Increment the counter for this type of object, and add the 
            # appropriate subscript:

            $object_count{$object}++;
            $object = sprintf("%s[%d].",$object,$object_count{$object}); 

            next;
          }


        # Add the object path to the keyword value.  If this string has a value
        # in the %size array, save its corresponding value in the %val array:

        $keyword = $object . $keyword;
#printf "      Checking $keyword\n";
        if ($size{$keyword})
          { $val{$keyword} = $value; }

        # Check for the special fields associated with the data file pointer:

        if ( $keyword =~/^\^/  &&  !$found_first)   # Ignore all but first
          { $keyword =~ s/^\^//;                    #  pointer field

            # Check to see if the value is enclosed in parentheses, If so,
            # this indicates a filename with an offset, either in records 
            # or bytes (BYTES flag included):

            if ( $value =~ /[(]/ )
              { $value =~ s/  *//g;           # Collapse blanks
                $value =~ s/[()]//g;          # Remove parentheses
                ($filename, $offset, $byte_flag) = split(/,/,$value);
                $filename =~ s/("|')//g;      # Remove quotes
              }
            else
              { $filename = $value; }

           # Assign the Filename and offset values to their respective elements:

            if ($byte_flag)
              { $offset .= " " . $byte_flag; }
            $val{'^FILE'}   = $filename;
            $val{'^OFFSET'} = $offset;
            $val{'^OBJECT'} = $keyword;
            $found_first = 1;
          }

      }

    #
    #  End of label.  Print the index line for this label.  Values are printed
    #  in the order in which they were listed in the @name array:
    #

    if ($opt_o)
      { $oldhandle = select(OUTPUT); }

    #  In each case, first we write the format string, then substitute the
    # literal value, if needed, for missing values, then write the value:

    for ($i=0; $i<$num_fields-1; $i++)
      { $bytes = $size{$name[$i]};
        if ($bytes < 0)
          { $format = sprintf ("%%%d.%ds ",$bytes,-$bytes); }
        else
          { $format = sprintf ("%%%d.%ds ",$bytes,$bytes); }
        $prtval = ($val{$name[$i]} eq "")? $literal[$i] : $val{$name[$i]};
        printf $format,$prtval;
      }
    $bytes = $size{$name[$num_fields-1]};
    if ($bytes < 0)
      { $format = sprintf ("%%%d.%ds",$bytes,-$bytes); }
    else
      { $format = sprintf ("%%%d.%ds",$bytes,$bytes); }
    $prtval = ($val{$name[$num_fields-1]} eq "")? $literal[$num_fields-1] :
                                                  $val{$name[$num_fields-1]};
    printf "$format",$prtval;
    printf "\n";
    if ($opt_o)
      { select($oldhandle); }

  }


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

sub nextline

  # Routine to return the next complete line from the input label file open
  # on <LABEL>.

{ local($buff);       # Line buffer
  local($line);       # Line to be returned
  local($not_done);   # Loop flag
  local($val,$t);     # value part of line and remainder
  local($qt);         # Quote character

  $line = "";

  # Get the next line, formatted for processing:

  return undef unless ($buff = &cleanline);

  # If there is no "=", return now:

  return $buff if ($buff !~ /=/);

  # We have at least the beginning of an assignment statement.  Save
  # it into $line:

  $line = $buff;

  # Now, we have to make sure we have the entire value.  The cases are:
  #
  #  1. If nothing follows the "=", we must read the next line to find 
  #     the value.
  #  2. If a simple value follows the "=", return
  #  3. If a quote follows the "=", we must read lines until we find a
  #     matching end quote.
  #

  ($t,$val) = split(/=/,$buff);

  # 1. Read next line

  if (! $val)
    { return $line unless ($buff = &cleanline);
      $line = "$line $buff";
      $val = $buff;
    }

  # 2. Simple value (and check for a matching quote on this line):

  if ($val !~ /^("|')/  ||  $val =~ /^".*"/  ||  $val =~ /^'.*'/)
    { return $line; }

  # 3. Find matching quote.

  $qt = substr($val,0,1);

  $not_done = 1;
  while ($not_done)
    { return $line unless ($buff = &cleanline);

      # Look for closing quote.  If found, delete anything after it and set
      # loop flag:

      if ($buff =~ /$qt/)
        { $buff =~ s/$qt.*$/$qt/;
          $not_done = 0;
        }

      # If the line is more than 255 characters long, then assume we're in
      # some kind of description field and ignore it.  Otherwise, concatenate
      # it to the end of the current buffer.  The last line is always added
      # so that the quotes will balance.

      if (length($line) < 255  ||  ! $not_done)
        { $line = "$line $buff"; }
    }

  # Return the line:

  return $line;

}

#-----------------------------------------------------------------------------

sub cleanline

  # Reads the next line from <LBL> and cleans it up for processing

{ local($buff);     # input buffer

  $buff = "";

  while ($buff eq "")
    { if (!($buff = <LABEL>))
        { return undef; }

      chomp $buff;     # Remove all record delimiters

      # Remove comments.  This is tricky, since the comment delimiter may
      # appear within an open quote, or after a closing quote:

      if ($buff =~ /=\s*"/)
        { $buff =~ s/(=\s*"[^"]*").*$/$1/; } # Drops everything after closing "
      elsif ($buff =~ /=\s*'/)
        { $buff =~ s/(=\s*'[^']*').*$/$1/; }
      else
        { $buff =~ s/\s*\/\*.*$//; }

      $buff =~ s/^\s+//;
      $buff =~ s/\s*$//;
      $buff =~ s/\s*=\s*/=/;
    }

  return $buff;
}

#-----------------------------------------------------------------------------

sub is_label

  # Checks for PDS_VERSION_ID as first line of a file to determine if it is a
  # PDS label.

{ local ($file) = $_[0];
  my ($line);

  open(CHK,$file) || die "Could not open $file to check for label ($!)";

  $line=<CHK>;
  while (($line =~ /^\s*#/) ||  ($line =~ /^\s*$/)  || ($line =~/SFDU/))
    { $line = <CHK>; }

  if ($line =~ /^\s*PDS_VERSION_ID/)
    { return 1; }
  else
    { return 0; }
}