parse.c

/*                        Copyright (c) 1988 Bellcore
**                            All Rights Reserved
**       Permission is granted to copy or use this program, EXCEPT that it
**       may not be sold for profit, the copyright notice must be reproduced
**       on copies, and credit should be given to Bellcore where it is due.
**       BELLCORE MAKES NO WARRANTY AND ACCEPTS NO LIABILITY FOR THIS PROGRAM.
*/


#ifndef lint
static char rcsid[]= "$Header: parse.c,v 1.1 88/09/15 11:33:57 daniel Rel $";
#endif

#include "misc.h"
#include "flagdefs.h"
#include "float.h"
#include "tol.h"
#include "token.h"
#include "line.h"
#include "command.h"
#include "comment.h"
#include "parse.h"


#include <ctype.h>

/*
**	a progress message is issued through Z_chatter each time the
**	token count of the file being parsed is a multiple of this value
*/
#define _P_PARSE_CHATTER	1000


/*
**	module-wide parser state
*/
static	int _P_realline;	/* real line number of the line being parsed;
					advanced by _P_nextline() */
static  int _P_fnumb;		/* number of the file being parsed */

static  char *_P_nextchr;	/* pointer to the next character to parse */
static	char *_P_firstchr;		/* pointer to the beginning of the line being parsed */
static	int _P_next_tol;		/* index of the tolerance to give to the next
					float on this line; reset for each line */
static	int _P_stringsize;		/* number of characters consumed by the
					literal last scanned by _P_litsnarf() */
static int _P_has_content;	/* non-zero when the line being parsed is
					a real line rather than a command */
static int _P_start;		/* first line to parse */
static int _P_lcount;		/* number of lines to parse */

static int _P_flags;		/* location for global flags */

/*
**	by default, "words" can be made up of numbers and letters
**	the following code allows for extending the alphabet that can
**	be used in words. this is useful for handling languages such
**	as C where the underscore character is an allowable character
**	in an identifier.  If a character (such as underscore) is NOT added
**	to the alphabet, the identifier will be broken into 2 or more "words"
**	by the parser.  as such the two sequences
**			one_two
**		and
**			one _ two
**	would look identical to spiff.
*/
#define _P_ALPHALEN 256
static char _P_alpha[_P_ALPHALEN];	/* the extra word characters, kept
						as a NUL-terminated string */

/*
**	empty the extended alphabet by truncating the string
**	to zero length
*/
static void
_P_alpha_clear()
{
	_P_alpha[0] = '\0';
}

/*
**	look chr up in the extended alphabet
**
**	returns a pointer to chr's position inside _P_alpha if it is
**	there, and a null pointer if it is not.  the string terminator
**	is never considered part of the alphabet -- without the explicit
**	test strchr would report a match on _P_alpha's own terminator.
*/
static char *
_P_in_alpha(chr)
char chr;
{
	return (chr != '\0') ? strchr(_P_alpha,chr) : 0;
}

/*
**	append characters to the extended alphabet
**
**	ptr	text whose leading word supplies the new characters
**
**	Z_fatal is called if the addition would not fit in _P_alpha
*/
void
P_addalpha(ptr)
char *ptr;
{
	char word[Z_LINELEN];	/* the characters that are being added */

	/*
	**	take everything up to (but not including)
	**	the first whitespace character
	*/
	S_wordcpy(word,ptr);

	/*
	**	the combined length must leave room for the terminator
	*/
	if (_P_ALPHALEN <= (strlen(_P_alpha) + strlen(word)))
	{
		Z_fatal("too many characters added to extended alphabet");
	}

	(void) strcat(_P_alpha,word);
}

/*
**	return the parser to its default state and then apply
**	the commands that were supplied at execute time
*/

static char _P_dummyline[2];	/* a place to aim wild pointers */
static void
_P_initparser()
{
	/*
	**	make the dummy line look like an exhausted line
	*/
	*_P_dummyline = '\0';

	/*
	**	wipe the state held by each of the modules ...
	*/
	C_clear_cmd();		/* embedded command key word */
	T_clear_tols();		/* tolerances */
	W_clearcoms();		/* comments */
	W_clearlits();		/* literals */
	_P_alpha_clear();	/* extended alphabet */

	/*
	**	... and rebuild it from the execute-time commands
	*/
	C_docmds();
}


/*
**	returns non-zero when the parse pointer sits on a string
**	terminator, i.e. the current line has been used up and
**	another one has to be fetched
*/
static int
_P_needmore()
{
	return(*_P_nextchr == '\0');
}

/*
**	close out the line that was just parsed and move on
**	to the next real line of the file
**
**	returns 1 when the range of lines to be parsed
**	(_P_lcount lines starting at _P_start) is exhausted; in that
**	case _P_firstchr and _P_nextchr are left untouched.
**	returns 0 when another line has been made current.
**
**	if the new line is an embedded command, _P_nextchr is aimed at
**	the (empty) dummy line and _P_has_content is cleared, so callers
**	see a line that is already used up.
*/
static int
_P_nextline()
{
	/*
	**	if the line that we just finished had
	**		some content,  increment the count
	*/
	if (_P_has_content)
	{
		L_incclmax(_P_fnumb);
		/*
		**	if the previous line had a token
		**		increment the line
		**		and start the new token line with no tokens
		*/
		if (L_getcount(_P_fnumb,L_gettlmax(_P_fnumb)))
		{
			L_inctlmax(_P_fnumb);
			L_setcount(_P_fnumb,L_gettlmax(_P_fnumb),0);
		}
		_P_has_content = 0;
	}

	/*
	**	tolerances are assigned per line, so start
	**	again with the first one
	*/
	_P_next_tol = 0;

	/*
	**	get another line if there is one available
	*/
	_P_realline++;
	if (_P_realline >= _P_start+_P_lcount)
	{
		return(1);
	}

	_P_firstchr = _P_nextchr = L_getrline(_P_fnumb,_P_realline);
	/*
	**	and look for a command
	*/
	if (C_is_cmd(_P_firstchr))
	{
		/*
		**	nothing on a command line is to be parsed
		*/
		_P_nextchr = _P_dummyline;
		_P_has_content = 0;
	}
	else
	{
		/*
		**	we have a real line, so set up the index
		**	from the content line to the real line
		*/
		L_setclindex(_P_fnumb,L_getclmax(_P_fnumb),_P_realline);
		_P_has_content = 1;
	}
	return(0);
}

/*
**	the following three routines (_P_litsnarf, _P_bolsnarf, and _P_comsnarf)
**	all do roughly the same thing. they scan ahead and collect the
**	specified string, move _P_nextchr to the end of the
**	comment or literal and return 1 if we run off the end of file,
**	0 otherwise.  it would have been nice to have 1 routine handle
**	all three tasks (there is much common code), however there were
**	enough differences (for instance, only comments check for nesting,
**	only literals need to set _P_stringsize, etc.)
**	that I decided to split them up.
/*
**	scan past the delimited literal that starts at _P_nextchr
**
**	litptr	description of the literal (begin, end and escape strings)
**
**	on return _P_nextchr sits just past the literal and _P_stringsize
**	holds the number of characters that were stepped over.
**	returns 1 if the file ran out before the literal ended, 0 otherwise.
*/
static int
_P_litsnarf(litptr)
W_lit litptr; 
{
	char *esc;	/* escape string of this literal */
	char *end;	/* end string of this literal */
	int step;	/* how far to advance on this pass */

	/*
	**	step over the string that opens the literal
	*/
	_P_stringsize = strlen(W_litbegin(litptr));
	_P_nextchr += _P_stringsize;

	/*
	**	a literal with no end string consists
	**		of its begin string alone
	*/
	if ('\0' == *(W_litend(litptr)))
	{
		return(0);
	}

	/*
	**	one pass for each unit (character, end string or
	**	escaped end string) of the literal
	*/
	for (;;)
	{
		/*
		**	fetch another line when this one is used up
		*/
		if (_P_needmore())
		{
			if (_P_nextline())
			{
				return(1);
			}
			/*
			**	the new line was a command, which may have
			**	reset the literals; give up if this one
			**	is no longer defined
			*/
			if (!_P_has_content && !W_is_lit(litptr))
			{
				return(0);
			}
		}

		/*
		**	look the strings up only now, after any command
		**	on a freshly fetched line has been handled
		*/
		esc = W_litescape(litptr);
		end = W_litend(litptr);

		if (('\0' != *esc) &&
		    !S_wordcmp(_P_nextchr,esc) &&
		    !S_wordcmp(_P_nextchr+strlen(esc),end))
		{
			/*
			**	an escaped end string is
			**		part of the literal
			*/
			step = strlen(esc) + strlen(end);
		}
		else if (!S_wordcmp(_P_nextchr,end))
		{
			/*
			**	the real end string, so we are finished
			*/
			_P_nextchr += strlen(end);
			_P_stringsize += strlen(end);
			return(0);
		}
		else
		{
			/*
			**	just one more character of the literal
			*/
			step = 1;
		}

		_P_nextchr += step;
		_P_stringsize += step;
	}

#ifndef lint
	Z_fatal("shouldn't execute this line at the end of _P_litsnarf");
#endif
} /* _P_litsnarf */

/*
**	scan past the beginning-of-line comment that
**	starts at _P_nextchr
**
**	bolptr	description of the comment (begin, end and escape strings)
**
**	on return _P_nextchr sits just past the comment.
**	returns 1 if the file ran out before the comment ended, 0 otherwise.
*/
static int
_P_bolsnarf(bolptr)
W_bol bolptr; 
{
	char *esc;	/* escape string of this comment */
	char *end;	/* end string of this comment */

	/*
	**	step over the string that opens the comment
	*/
	_P_nextchr += strlen(W_bolbegin(bolptr));

	/*
	**	a comment with no end string consists
	**		of its begin string alone
	*/
	if ('\0' == *(W_bolend(bolptr)))
	{
		return(0);
	}

	/*
	**	one pass for each unit (character, end string or
	**	escaped end string) of the comment
	*/
	for (;;)
	{
		/*
		**	fetch another line when this one is used up
		*/
		if (_P_needmore())
		{
			if (_P_nextline())
			{
				return(1);
			}
			/*
			**	the new line was a command, which may have
			**	reset the comments; give up if this one
			**	is no longer defined
			*/
			if (!_P_has_content && !W_is_bol(bolptr))
			{
				return(0);
			}
		}

		/*
		**	look the strings up only now, after any command
		**	on a freshly fetched line has been handled
		*/
		esc = W_bolescape(bolptr);
		end = W_bolend(bolptr);

		/*
		**	an escaped end string does not close the comment
		*/
		if (('\0' != *esc) &&
		    !S_wordcmp(_P_nextchr,esc) &&
		    !S_wordcmp(_P_nextchr+strlen(esc),end))
		{
			_P_nextchr += strlen(esc) + strlen(end);
			continue;
		}

		/*
		**	the real end string, so we are finished
		*/
		if (!S_wordcmp(_P_nextchr,end))
		{
			_P_nextchr += strlen(end);
			return(0);
		}

		/*
		**	just one more character of the comment
		*/
		_P_nextchr++;
	}

#ifndef lint
	Z_fatal("shouldn't execute this line in at end of _P_bolsnarf");
#endif
} /* _P_bolsnarf */

/*
**	pass over a comment -- look for nesting
**
**	comptr	description of the comment (begin, end and escape strings,
**		and whether comments of this kind nest)
**
**	on return _P_nextchr sits just past the comment.  for a nesting
**	comment every further begin string seen inside it must be
**	balanced by its own end string before the comment is finished.
**	returns 1 if the file ran out before the comment ended, 0 otherwise.
*/
static int
_P_comsnarf(comptr)
W_com comptr; 
{
	int depth = 1; /* nesting depth */
	/*
	**	skip the start of comment string
	*/
	_P_nextchr += strlen(W_combegin(comptr));

	/*
	**	is there a separate end string
	**		if not, then we're done
	*/
	if ('\0' == *(W_comend(comptr)))
	{
		return(0);
	}
	/*
	**	loop once for each character in the comment
	*/
	while(1)
	{
		/*
		**	if we are out of characters, move on to next line
		*/
		if (_P_needmore())
		{
			if (_P_nextline())
			{
				return(1);
			}
			if (!_P_has_content)
			{
				/*
				**	since we've just gotten a command
				**		check to see if this comment
				**		is still legit ... comments
				**		could have just been reset
				**		by the command
				*/
				if (!W_is_com(comptr))
				{
					return(0);
				}
			}
		} /* if at end of line */

		/*
		**	see if we have an escaped end of comment string
		*/
		if ('\0' != *(W_comescape(comptr)) &&  /* escape string exists */
		  !S_wordcmp(_P_nextchr,
			   W_comescape(comptr)) &&    /* and escape matches */
		  !S_wordcmp(_P_nextchr+strlen(W_comescape(comptr)),
			   W_comend(comptr)))	/* and end string matches */
		{
			/*
			** skip over the escape sequence and the end sequence
			*/
			_P_nextchr += strlen(W_comescape(comptr))
					+ strlen(W_comend(comptr));
			continue;
		}

		/*
		**	see if we have an end of comment string
		*/
		if (!S_wordcmp(_P_nextchr,W_comend(comptr))) /* end  matches */
		{
			/*
			**	skip over the end sequence
			*/
			_P_nextchr += strlen(W_comend(comptr));
			if (W_is_nesting(comptr))
			{
				/*
				**	only the end that balances the
				**	outermost begin finishes the comment
				*/
				depth--;
				if (0 == depth)
					return(0);
			}
			else
			{
				return(0);
			}
			continue;
		}
		/*
		**	see if we have another beginning of comment string.
		**	this has to test for the BEGIN string: the end
		**	string was already dealt with just above, so testing
		**	for it again here could never succeed and the
		**	nesting depth would never grow.
		*/
		if (W_is_nesting(comptr) &&
			!S_wordcmp(_P_nextchr,W_combegin(comptr))) /* begin matches */
		{
			_P_nextchr += strlen(W_combegin(comptr));
			depth++;
			continue;
		}
		/*
		**	this must be yet another character in the comment, so
		**	just snarf it up
		*/
		_P_nextchr++;
	}	/* while loop once for each character */

#ifndef lint
		Z_fatal("should not execute this line in _P_comsnarf\n");
#endif

} /* _P_comsnarf */


/*
**	parse a file
*/
static void
_P_do_parse()
{

	char *ptr;		/* scratch space */
	int tmp;
	int ret_code;

	K_token newtoken;
	W_bol bolptr;
	W_com comptr;
	W_lit litptr;

	int startline, endline, startpos;

	/*
	**	main parsing loop
	*/
	while (1)
	{
		/*
		**	get more text if necessary
		*/
		if (_P_needmore())
		{
			if (_P_nextline())
			{
				return;
			}

			/*
			**	if the line contains nothing of interest,
			**		try again
			*/
			if (!_P_has_content)
			{
				continue;
			}

			/*
			**	check to see if this line starts a comment
			*/
			if ((bolptr = W_isbol(_P_firstchr)) != W_BOLNULL)
			{
				if (_P_bolsnarf(bolptr))
				{
					return;
				}
				continue;
			}
		} /* if _P_needmore */

		/*
		**	skip whitespace
		*/
		if (!(U_INCLUDE_WS & _P_flags) && isspace(*_P_nextchr))
		{
			_P_nextchr++;
			continue;
		}

		/*
		**	check to see if this character starts a comment
		*/
		if ((comptr = W_iscom(_P_nextchr)) != W_COMNULL)
		{
			if (_P_comsnarf(comptr))
			{
				return;
			}
			continue;
		}

		/*
		**	if there aren't any tokens on this line already
		**	set up the index from the token line to the content line
		*/
		if (!L_getcount(_P_fnumb,L_gettlmax(_P_fnumb)))
		{
			L_settlindex(_P_fnumb,
					L_gettlmax(_P_fnumb),
					L_getclmax(_P_fnumb));
			/*
			**	and the pointer from the token line to the 
			** 	first  token on the line
			*/
			L_setindex(_P_fnumb,
					L_gettlmax(_P_fnumb),
					K_gettmax(_P_fnumb));
		}

		startline =  L_tl2cl(_P_fnumb,L_gettlmax(_P_fnumb));
		startpos = _P_nextchr-_P_firstchr;

		newtoken = K_maketoken();
		K_setline(newtoken,L_gettlmax(_P_fnumb));
		K_setpos(newtoken,startpos);

		ret_code = 0;
		/*
		**	check to see if this character starts a
		**		delimited literal string
		*/
		if ((litptr = W_islit(_P_nextchr)) != W_LITNULL)
		{
			ret_code = _P_litsnarf(litptr);
			K_settype(newtoken,K_LIT);
			S_allocstr(&ptr,_P_stringsize);
			/*
			**	fixed nasty memory bug here by adding else
			**	old code copied entire line even if literal
			**	ended before the end of line
			**		should check into getting strcpy loaded
			**		locally
			*/
			endline = L_getclmax(_P_fnumb);
			if (endline > startline)
			{
				/*
				**	copy in the first line of the literal
				*/
				(void) strcpy(ptr,
					      L_getcline(_P_fnumb,startline)
							+startpos);
				/*
				**	now copy all the lines between
				**		the first and last
				*/
				for (tmp=startline+1;tmp<endline;tmp++)
				{
					(void) strcat(ptr,
						      L_getcline(_P_fnumb,tmp));
				}
				/*
				**	and now copy in the last line
				*/
				(void) strncat(ptr,
					       L_getcline(_P_fnumb,endline),
					       _P_stringsize-strlen(ptr));
			}
			else
			{
				(void) strncpy(ptr,
					       L_getcline(_P_fnumb,startline)
								+startpos,
					      _P_stringsize);
				/*
				**	terminate the string you just copied
				*/
				ptr[_P_stringsize] = '\0';
			}
			K_settext(newtoken,ptr);
		} /* if is_lit */

		/*
		**	see if this is a floating point number
		*/
		else if (tmp = F_isfloat(_P_nextchr,
				       _P_flags & U_NEED_DECIMAL,
				       _P_flags & U_INC_SIGN))
		{
			K_saventext(newtoken,_P_nextchr,tmp);
			K_settype(newtoken,K_FLO_NUM);
			if (!(_P_flags & U_BYTE_COMPARE))
			{
				K_setfloat(newtoken,
					   F_atof(K_gettext(newtoken),
					   USE_ALL));

				/*
				**	assign the curent tolerance
				*/
				K_settol(newtoken,T_gettol(_P_next_tol));
			}

			/*
			**	use next tolerance in the
			**		specification if there is one
			*/
			if (T_moretols(_P_next_tol))
			{
				_P_next_tol++;
			}
			/*
			**	and move pointer past the float
			*/
			_P_nextchr += tmp;
		}

		/*
		**	is this a fixed point number
		*/
		else if (isdigit(*_P_nextchr))
		{
			for(ptr=_P_nextchr; isdigit(*ptr); ptr++)
			{
			}
			K_saventext(newtoken,_P_nextchr,ptr-_P_nextchr);
			K_settype(newtoken,K_LIT);
			_P_nextchr = ptr;
		}

		/*
		**	try an alpha-numeric word
		*/
		else if (isalpha(*_P_nextchr) || _P_in_alpha(*_P_nextchr))
		{
			/*
			**	it's a multi character word
			*/
			for(ptr = _P_nextchr;
			    isalpha(*ptr)
				|| isdigit(*ptr)
				|| _P_in_alpha(*ptr);
			    ptr++)
			{
			}
			K_saventext(newtoken,_P_nextchr,ptr-_P_nextchr);
			K_settype(newtoken,K_LIT);
			_P_nextchr = ptr;
		}
		else
		{
			/*
			**	otherwise, treat the char itself as a token
			*/
			K_saventext(newtoken,_P_nextchr,1);
			K_settype(newtoken,K_LIT);
			_P_nextchr++;
		}

		K_settoken(_P_fnumb,K_gettmax(_P_fnumb),newtoken);
		L_inccount(_P_fnumb,L_gettlmax(_P_fnumb));
		/*
		**	if we are out of space, complain and quit
		*/
		if (K_inctmax(_P_fnumb))
		{
			(void) sprintf(Z_err_buf,
     "warning -- too many tokens in file only first %d tokens will be used.\n",
				       K_MAXTOKENS);
			Z_complain(Z_err_buf);
			return;
		}
#ifndef NOCHATTER
		if (0 == (K_gettmax(_P_fnumb) % _P_PARSE_CHATTER))
		{
			int max = K_gettmax(_P_fnumb);
			(void) sprintf(Z_err_buf,
				"scanned %d words from file #%d\n",
					max,_P_fnumb+1);
			Z_chatter(Z_err_buf);
		}
#endif

		/*
		**	are we done?
		*/
		if(ret_code)
		{
			return;
		}
	}   /* loop once per object on a line */

#ifndef lint 
	Z_fatal("this line should never execute");
#endif
}

/*
**	parse part of a file into tokens
**
**	the parser state is reset, the execute-time commands are
**	re-applied, and then lcnt lines starting at strt are scanned.
*/
void
P_file_parse(num,strt,lcnt,flags)
int num;	/* file number */
int strt;	/* first line to parse expressed in real line numbers */
int lcnt;	/* max number of lines to parse */
int flags;	/* flags for controlling the parse mode */
{
	/*
	**	record the request where the rest of the module can see it
	*/
	_P_flags = flags;
	_P_lcount = lcnt;
	_P_start = strt;
	_P_fnumb = num;

	_P_initparser();

	/*
	**	begin with an exhausted (dummy) line, so that the parser
	**	fetches a line straight away.  the line counter starts one
	**	short because it is incremented just before the first
	**	line is accessed.
	*/
	_P_realline = _P_start-1;
	_P_next_tol = 0;
	_P_has_content = 0;
	_P_nextchr = _P_dummyline;

	L_setcount(_P_fnumb,L_gettlmax(_P_fnumb),0);

	_P_do_parse();

	/*
	**	normally the last line has been closed out
	**		by the time the parser returns
	*/
	if (!_P_has_content)
	{
		return;
	}

	/*
	**	otherwise we stopped parsing in the middle of a line.
	**	this case has not been looked at carefully, so there
	**	is a good chance that it is buggy.  complain, and then
	**	count the unfinished line.
	*/
	(void) sprintf(Z_err_buf,"parser got confused at end of file\n");
	Z_complain(Z_err_buf);

	L_incclmax(_P_fnumb);
	if (L_getcount(_P_fnumb,L_gettlmax(_P_fnumb)))
	{
		L_inctlmax(_P_fnumb);
	}
}