Logo Search packages:      
Sourcecode: bibclean version File versions

match.c

/***********************************************************************

==========
BACKGROUND
==========

This file contains an implementation of limited regular-expression
pattern matching code.  The pattern syntax is simpler, more limited,
and different from normal regular-expression pattern matching syntax.
It is described in more detail below.

The motivation for this new code is that I found considerable
inconsistency in the matching behavior between versions of either
re_comp()/re_exec() or compile()/step() on these systems

      DECstation 3100
      IBM 3090
      IBM PS/2
      IBM RS/6000 AIX 3.2
      NeXT Mach 3.0
      Silicon Graphics IRIX 4.0
      Stardent OS 2.2
      Sun SPARC

That makes use of those regular-expression pattern-matching routines
unreliable across systems.

One possible solution would be to use the GNU re_comp() and re_exec()
from the regexp distribution on prep.ai.mit.edu (as of writing,
pub/gnu/regex-0.11.*).  However, that code is large (5000+ lines), and
its installation uses configuration facilities that only work under
some variants of UNIX, and are completely useless on other operating
systems.

By contrast, the pattern matching code here is quite adequate for
bibclean's needs, and can be expressed in fewer than 140 lines.  In
addition, it provides special handling of TeX control sequences and
braces that would be rather awkward to express in conventional
regular-expression syntax.

If the symbol TEST is defined at compile time, a main program will be
included that can be used for testing patterns supplied from stdin.


==============
PATTERN SYNTAX
==============

The string values to be pattern-matched are tab-free single-line
values delimited by quotation marks.

The patterns are represented by the following special markers:

      a     exactly one letter
      A     one or more letters
      d     exactly one digit
      D     one or more digits
      r     exactly one Roman numeral
      R     one or more Roman numerals (i.e. a Roman number)
      w     exactly one word (one or more letters and digits)
      W     one or more space-separated words, beginning and ending
            with a word
      X     one or more special-separated words, beginning and ending
            with a word
      .     one special character (see SPECIAL_CHARS defined below)
      :     one or more special characters
      <space>     one or more spaces
      \x    exactly one x (x is any character)
      x     exactly the character x (x is anything but aAdDrRwWX.:<space>\)

Special characters are a subset of punctuation characters that are
typically used in values.

Note that \<space> represents a single literal space, \\ a single
literal backslash, \a the letter a, \A the letter A, \d the letter d,
\D the letter D, and so on.  Remember to double all backslashes in C
strings: \a must be entered as \\a, and "and" as "\\an\\d".

Each pattern is matched against the entire string and must match
successfully for a YES return from match_pattern().  Consequently,
there is no need for an analogue of ^ and $ in full regular
expressions.  Neither is there provision for matching on arbitrary
sets of characters.  Instead, fixed sets of characters are provided
(conventional regular-expression equivalents are shown in
parentheses):

      digits ([0-9]),
      alphanumerics ([A-Za-z0-9]),
      space ([ \t\f\r\n\v]), and
      special ([][ !#()*+,-./:;?~])

In addition, TeX control sequences of the form
\<one-non-letter-character> or \<letter-sequence> in the string are
ignored in the match, together with any following whitespace.

Braces are also ignored, but not whitespace following them.

Thus "{TR\slash A87}" matches the patterns "AD" and "W", and
"{TR A\slash 87}" matches the patterns "A AD" and "A W".

[29-Jan-1993]
***********************************************************************/

#include <config.h>
#include "xstdlib.h"
#include "xstring.h"
#include "xctype.h"
#include "yesorno.h"
#include "match.h"                  /* must come AFTER yesorno.h */

RCSID("$Id: match.c,v 1.2 1996/05/03 20:19:33 beebe Exp beebe $")

/* Fallback definition, used only when none of the headers included
   above has already defined EXIT_SUCCESS. */
#ifndef EXIT_SUCCESS
#define EXIT_SUCCESS 0
#endif

/* The set of "special" (punctuation) characters recognized by the
   '.', ':' and 'X' pattern markers. */
#define SPECIAL_CHARS   " !#()*+,-./:;?[]~"

/* Nonzero if c is one of SPECIAL_CHARS.

   strchr() treats the terminating NUL of its first argument as part of
   the string, so strchr(SPECIAL_CHARS,'\0') is non-NULL; the explicit
   NUL test keeps end-of-string from being classified as special (which
   would let callers step past the end of the value being matched).

   NOTE: the argument is evaluated twice; pass only side-effect-free
   expressions such as *s. */
#define isspecial(c)    (((c) != '\0') && \
                         (strchr(SPECIAL_CHARS,(c)) != (char*)NULL))

/* Forward declarations.  next_s() is the file-local string-advance
   helper defined later in this file; isroman() is used by the 'r' and
   'R' pattern markers and is not defined in the code visible here
   (presumably supplied by another module -- confirm). */
static const char *next_s ARGS((const char *s_));
int               isroman ARGS((int c_));

/***********************************************************************
match_pattern() -- return YES if the ENTIRE string s matches pattern,
else NO.

The pattern is scanned left to right, one marker at a time, with no
backtracking; the "one or more" markers are greedy.  The string is
advanced with next_s(), so braces, TeX control sequences, and hidden
inline comments in s are skipped and never take part in the match.

Markers handled below:
    a / A   one / one-or-more letters
    d / D   one / one-or-more digits
    r / R   one / one-or-more Roman numerals (as judged by isroman())
    w       one word (run of letters and digits)
    W       words separated by whitespace runs (a trailing whitespace
            run after the last word is also consumed)
    X       words separated by special-character runs (likewise)
    <space> one or more whitespace characters
    .       exactly one special character
    :       one or more special characters
    \x      the literal character x
    other   the literal character itself

A pattern that ends in a lone backslash is malformed and never matches.
***********************************************************************/

/* isspecial() is built on strchr(), which also "finds" the terminating
   NUL of SPECIAL_CHARS.  This wrapper guarantees that end-of-string is
   never treated as a special character, so the loops below cannot step
   past the end of s. */
#define MATCH_AT_SPECIAL(p)     ((*(p) != '\0') && isspecial(*(p)))

#if defined(HAVE_STDC)
YESorNO
match_pattern(const char *s, const char *pattern)
#else /* K&R style */
YESorNO
match_pattern(s,pattern)
const char *s;
const char *pattern;
#endif
{
    /* Position s on its first significant character.  next_s() always
       steps forward one character before skipping ignorable material,
       hence the s-1 here.
       NOTE(review): forming s-1 is formally undefined when s points at
       the first element of its array; removing it requires a variant
       of next_s() that does not pre-increment. */
    s = next_s(s-1);
    for ( ; *pattern; ++pattern)
    {
        switch (*pattern)
        {
        case 'a':                   /* single letter */
            if (!isalpha(*s))
                return (NO);
            s = next_s(s);
            break;

        case 'A':                   /* one or more letters */
            if (!isalpha(*s))
                return (NO);
            while (isalpha(*s))
                s = next_s(s);
            break;

        case 'd':                   /* single digit */
            if (!isdigit(*s))
                return (NO);
            s = next_s(s);
            break;

        case 'D':                   /* one or more digits */
            if (!isdigit(*s))
                return (NO);
            while (isdigit(*s))
                s = next_s(s);
            break;

        case 'r':                   /* single Roman numeral */
            if (!isroman(*s))
                return (NO);
            s = next_s(s);
            break;

        case 'R':                   /* one or more Roman numerals */
            if (!isroman(*s))
                return (NO);
            while (isroman(*s))
                s = next_s(s);
            break;

        case 'w':                   /* one word (letters and digits) */
            if (!isalnum(*s))
                return (NO);
            while (isalnum(*s))
                s = next_s(s);
            break;

        case 'W':                   /* one or more space-separated words */
            if (!isalnum(*s))
                return (NO);
            while (isalnum(*s))     /* parse first word */
                s = next_s(s);
            for (;;)
            {
                if (!isspace(*s))
                    break;
                while (isspace(*s)) /* parse separators */
                    s = next_s(s);
                while (isalnum(*s)) /* parse another word */
                    s = next_s(s);
            }
            break;

        case 'X':                   /* one or more special-separated words */
            if (!isalnum(*s))
                return (NO);
            while (isalnum(*s))     /* parse first word */
                s = next_s(s);
            for (;;)
            {
                if (!MATCH_AT_SPECIAL(s))
                    break;
                while (MATCH_AT_SPECIAL(s)) /* parse separators */
                    s = next_s(s);
                while (isalnum(*s)) /* parse another word */
                    s = next_s(s);
            }
            break;

        case ' ':                   /* one or more whitespace characters */
            if (!isspace(*s))
                return (NO);
            while (isspace(*s))
                s = next_s(s);
            break;

        case '.':                   /* exactly one special character */
            if (!MATCH_AT_SPECIAL(s))
                return (NO);
            s = next_s(s);          /* consume it (was missing, so the
                                       following marker always saw the
                                       same special character again) */
            break;

        case ':':                   /* one or more special characters */
            if (!MATCH_AT_SPECIAL(s))
                return (NO);
            while (MATCH_AT_SPECIAL(s))
                s = next_s(s);
            break;

        case '\\':                  /* literal next character */
            pattern++;
            if (*pattern == '\0')   /* lone trailing backslash: malformed */
                return (NO);        /* pattern; also keeps ++pattern in   */
                                    /* the for loop inside the string     */
            /* fall through to exact match test */

        default:                    /* anything else: exact match */
            if (*pattern != *s)
                return (NO);
            s = next_s(s);          /* *s == *pattern != '\0' here */
            break;
        }                           /* end switch */
    }                               /* end for (; ;) */
    return (*s == '\0' ? YES : NO); /* YES if reached end of string */
}


/***********************************************************************
next_s() -- step past the character at s and return a pointer to the
next significant character of the string (or to its terminating NUL).

After the initial one-character step, the following are skipped:
    - a TeX control sequence: a backslash followed by either a run of
      letters or a single non-letter, plus any whitespace after it;
    - the brace characters { and };
    - a hidden inline comment: everything from one
      BIBTEX_HIDDEN_DELIMITER up to and including the next one (or up
      to end of string if it is unterminated).

The caller must not call this with s already at the terminating NUL:
the unconditional initial ++s would step beyond the string.
***********************************************************************/
#if defined(HAVE_STDC)
static const char *
next_s(const char *s)
#else /* K&R style */
static const char *
next_s(s)
const char *s;
#endif
{
    for (++s; *s; )
    {
        switch (*s)
        {
        case '\\':                  /* TeX control sequence */
            ++s;                    /* look at next character */
            if (isalpha(*s))        /* <one-or-more-letters> */
            {
                while (isalpha(*s))
                    ++s;
            }
            else if (*s != '\0')    /* <non-letter>; a backslash that is  */
                ++s;                /* the last character of the string   */
                                    /* must not carry us over the NUL     */
            while (isspace(*s))     /* advance over trailing whitespace */
                ++s;                /* since TeX does too */
            break;

        case '{':
        case '}':
            ++s;
            break;

        case BIBTEX_HIDDEN_DELIMITER: /* ignore delimited inline comment */
            for (++s; *s; ++s)
            {
                if (*s == BIBTEX_HIDDEN_DELIMITER)
                {
                    ++s;            /* step over the closing delimiter */
                    break;
                }
            }
            break;

        default:                    /* significant character found */
            return (s);
        }                           /* end switch */
    }                               /* end for */
    return (s);                     /* reached end of string */
}

#ifdef TEST
/* Size of the input line buffer used by the test driver's main(). */
#define MAXLINE 256

/* Message value meaning "nothing to report": process() prints no
   "matches" line for a pattern whose message is NO_WARNING. */
#define NO_WARNING      (const char *)NULL

/* Test patterns for "month" lines.  Each entry pairs a pattern with
   the message printed by process() when that pattern matches (here a
   sample value of that form); the list ends with a NULL pattern.
   Field order is presumably {pattern, message} as declared in
   match.h -- confirm. */
MATCH_PATTERN month_patterns[] =
{
    {"aaa",             "oct"},
    {"aaa # \" D\"",          "oct # \" 10\""},
    {"aaa # \" D--D\"",       "oct # \" 20--24\""},
    {"\"D \" # aaa",          "\"10 \" # oct"},
    {"\"D--D \" # aaa",       "\"10--24 \" # oct"},
    {"aaa # \"\" # aaa",      "jul # \"\\emdash \" # aug"},
    {"aaa # \"--\" # aaa",    "jul # \"--\" # aug"},
    {"aaa # \" -- \" # aaa",  "jul # \" -- \" # aug"},
    {"aaa # \"/\" # aaa",     "jul # \"/\" # aug"},
    {"aaa # \" A \" # aaa",   "jul # \" and \" # aug"},
    {(const char*)NULL,       NO_WARNING},
};

/* Test patterns for "number" lines; same layout as the other pattern
   tables (pattern, then the message printed on a match), ending with a
   NULL pattern.
   NOTE(review): in "D  D" the two adjacent <space> markers look
   unmatchable: the first one consumes the whole whitespace run, so the
   second never sees a whitespace character.  Confirm whether "D D"
   was intended. */
MATCH_PATTERN number_patterns[] =
{
    {"\"A AD\"",        "PN LPS5001"},
    {"\"A D(D)\"",            "RJ 34(49)"},
    {"\"A D\"",               "XNSS 288811"},
    {"\"A D\\.D\"",           "Version 3.20"},
    {"\"A-A-D-D\"",           "UMIAC-TR-89-11"},
    {"\"A-A-D\"",       "CS-TR-2189"},
    {"\"A-A-D\\.D\"",         "CS-TR-21.7"},
    {"\"A-AD-D\"",            "TN-K\\slash 27-70"},
    {"\"A-D D\"",       "PB-251 845"},
    {"\"A-D-D\"",       "ANL-30-74"},
    {"\"A-D\"",               "TR-2189"},
    {"\"AD-D-D\"",            "GG24-3611-00"},
    {"\"AD-D\"",        "SP43-29"},
    {"\"AD\"",                "LPS0064"},
    {"\"A\\#D-D\"",           "TR\\#89-24 ????"},
    {"\"D  D\"",        "23 \\& 24"},
    {"\"D \\an\\d D\"",       "11 and 12"},
    {"\"D+D\"",               "3+4"},
    {"\"D-D\"",               "23-27"},
    {"\"D/D\"",               "23/27"},
    {"\"DA\"",                "23A"},
    {"\"D\"",                 "23"},
    {"\"D\\.D\"",       "3.4"},
    {"\"W-W W\"",       "AERE-R 12329"},
    {"\"W-W-WW-W\"",          "OSU-CISRC-4\\slash 87-TR9"},
    {"\"W\"",                 "Computer Science Report 100"},
    {"\"X\"",                 "TR/AB/3-43.7-3/AB"},
    {(const char*)NULL,       NO_WARNING},
};

/* Test patterns for "pages" lines; same layout as the other pattern
   tables (pattern, then the message printed on a match), ending with a
   NULL pattern.  The tables are searched in order and the first
   matching pattern wins (see process()). */
MATCH_PATTERN pages_patterns[] =
{
    {"\"D\"",                 "23"},
    {"\"aD\"",                "L23"},
    {"\"D--D\"",        "23--27"},
    {"\"aD--aD\"",            "L23--L27"},
    {"\"D, D\"",        "23, 27"},
    {"\"aD, aD\"",            "L23, L27"},
    {"\"D, D, D\"",           "23, 27, 45"},
    {"\"aD, aD, aD\"",        "L23, L27, L45"},
    {"\"D, D, D, D\"",        "23, 27, 45, 98"},
    {"\"aD, aD, aD, aD\"",    "L23, L27, L45, L98"},
    {"\"R + D\"",       "viii + 445"},
    {"\"R + D, w D w\"",      "viii + 445, with 30 illustrations"},
    {"\"D, w D w\"",          "239, with 27 illustrations"},
    {"\"D--D, D--D\"",        "23--27, 29--32"},
    {"\"D--D, D--D, D--D\"",  "23--27, 29--32, 35--37"},
    {"\"aD--aD, aD--aD\"",    "L23--L27, L29--L32"},
    {"\"aD--aD, aD--aD, aD--aD\"", "L23--L27, L29--L32, L35--L37"},
    {(const char*)NULL,       NO_WARNING},
};

/* Test patterns for "volume" lines; same layout as the other pattern
   tables (pattern, then the message printed on a match), ending with a
   NULL pattern.  Note that the unescaped '.' in "D.D" is the
   one-special-character marker, not a literal period. */
MATCH_PATTERN volume_patterns[] =
{
    {"\"D\"",                 "27"},
    {"\"DA\"",                "27A"},
    {"\"D/D\"",               "27/3"},
    {"\"DA D\"",        "27A 3"},
    {"\"w-D\"",               "SMC-13"},
    {"\"A\"",                 "VIII"},
    {"\"D.D\"",               "1.2"},
    {"\"D \\an\\d D\"",       "11 and 12"},
    {"\"W\"",                 "Special issue A"},
    {(const char*)NULL,       NO_WARNING},
};

/* Test patterns for "year" lines; same layout as the other pattern
   tables (pattern, then the message printed on a match), ending with a
   NULL pattern.  Entries whose message is NO_WARNING match silently.
   NOTE(review): the D marker is greedy and match_pattern() never
   backtracks, so a D immediately following another D can never find a
   digit.  The "DDDD", "DDDD,WDDDD" and "DDDD, DDDD, DDDD" entries
   therefore look unmatchable -- confirm whether "dddd" was intended. */
MATCH_PATTERN year_patterns[] =
{
    {"\"DDDD\"",        NO_WARNING},
    {"\"DDDD,WDDDD\"",        NO_WARNING},
    {"\"DDDD, DDDD, DDDD\"",  NO_WARNING},
    {"\"18dd, 18dd, 18dd\"",  "1889, 1890, 1891"},
    {"\"18dd, 18dd\"",        "1889, 1890"},
    {"\"18dd--d\"",           "1891--2"},
    {"\"18dd\"",        "1892"},
    {"\"18dda18dd\"",         "{\\noopsort{1885a}}1885"},
    {"\"19dd (19dd)\"",       "1989 (1990)" },
    {"\"19dd, 19dd, 19dd\"",  "1989, 1990, 1991"},
    {"\"19dd, 19dd\"",        "1989, 1990"},
    {"\"19dd--d\"",           "1991--2"},
    {"\"19dd\" # \"--\"",     "\"1989\" # \"\\unskip--\""},
    {"\"19dd\"",        "1992"},
    {"\"19dda19dd\"",         "{\\noopsort{1985a}}1985"},
    {"\"200d--d\"",           "2001--2"},
    {"\"200d\"",        "2009"},
    {(const char*)NULL,       NO_WARNING},
};

/* Number of the input line currently being processed; incremented by
   main() for each line read and printed by main() and process() in
   their diagnostics. */
static long line_number;

/* Test-driver entry points, defined below. */
int         main ARGS((int argc,char* argv[]));
static void process ARGS((const char *line_, MATCH_PATTERN patterns_[]));

/***********************************************************************
main() -- test driver (compiled only when TEST is defined).

Reads lines from stdin, strips trailing whitespace and commas, and
hands each line whose first non-blank text is one of the keys month,
number, pages, volume, or year to process() together with the
corresponding pattern table.  Any other line is reported as "ignored".
Always exits with EXIT_SUCCESS.
***********************************************************************/
#if defined(HAVE_STDC)
int
main(int argc, char* argv[])
#else /* K&R style */
int
main(argc,argv)
int argc;
char* argv[];
#endif
{
    char line[MAXLINE];

    /* Input lines should look like
                  key = "value",
       where key is one of month, number, pages, volume, or year.
       Lines with any other key are reported as ignored. */

    line_number = 0L;
    while (fgets(line,MAXLINE,stdin) != (char*)NULL)
    {
        char *p;

        line_number++;

        /* Remove trailing whitespace and commas.  The p > line test
           stops the scan at the start of the buffer, so a line made up
           entirely of blanks and/or commas cannot make it run off the
           front of line[]. */
        p = strchr(line,'\0');
        while ((p > line) && (isspace(p[-1]) || (p[-1] == ',')))
            *--p = '\0';

        for (p = line; isspace(*p); ++p)    /* skip leading whitespace */
            /* NO-OP */;

        /* Compare the full key name in every case (month and pages
           were previously compared on their first 4 characters only). */
        if (strncmp(p,"month",5) == 0)
            process(line,month_patterns);
        else if (strncmp(p,"number",6) == 0)
            process(line,number_patterns);
        else if (strncmp(p,"pages",5) == 0)
            process(line,pages_patterns);
        else if (strncmp(p,"volume",6) == 0)
            process(line,volume_patterns);
        else if (strncmp(p,"year",4) == 0)
            process(line,year_patterns);
        else
            printf("%%%% %ld [%-24s]: %s\n", line_number, line, "ignored");
    }

    exit (EXIT_SUCCESS);
    return (EXIT_SUCCESS);          /* not reached; quiets compilers that
                                       expect main() to return a value */
}


/***********************************************************************
process() -- try one input line against a table of patterns.

Matching begins at the first quotation mark in line, or at the start
of line if it has none.  The table is searched in order up to its
NULL-pattern terminator; at the first pattern that matches, the
entry's message is printed (unless it is NO_WARNING) and the search
stops.  If nothing matches, the value is reported as illegal.
***********************************************************************/
#if defined(HAVE_STDC)
static void
process(const char *line, MATCH_PATTERN patterns[])
#else /* K&R style */
static void
process(line,patterns)
const char *line;
MATCH_PATTERN patterns[];
#endif
{
    const char *quote;
    int idx;

    /* skip the "key = " part when the value is quoted */
    quote = strchr(line,'"');
    if (quote != (char *)NULL)
        line = quote;

    idx = 0;
    while (patterns[idx].pattern != (const char*)NULL)
    {
        if (match_pattern(line,patterns[idx].pattern) == YES)
        {
            /* first match wins; silent when there is no message */
            if (patterns[idx].message != NO_WARNING)
                printf("%%%% %ld [%-24s]: matches %s\n", line_number, line,
                       patterns[idx].message);
            return;
        }
        ++idx;
    }

    /* fell off the end of the table: no pattern matched */
    printf("?? %ld [%-24s]: illegal value\n", line_number, line);
}
#endif /* TEST */

Generated by  Doxygen 1.6.0   Back to index