view tok.c @ 0:a7e41e1a79c8

Initial revision
author Guido Berhoerster <guido+pwm@berhoerster.name>
date Thu, 19 Jan 2017 22:39:51 +0100
parents
children 722a45b4028b
line wrap: on
line source

/*
 * Copyright (C) 2016 Guido Berhoerster <guido+pwm@berhoerster.name>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "compat.h"

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "tok.h"

/* states of the state machine driving tok_tokenize() */
enum tok_states {
	/* between tokens, whitespace is being skipped */
	STATE_INITIAL,
	/* inside the unquoted part of a token */
	STATE_IN_WORD,
	/* inside a quoted part of a token, waiting for the closing quote */
	STATE_IN_QUOTE,
	/* a backslash was seen in the unquoted part of a token */
	STATE_IN_WORD_ESCAPE,
	/* a backslash was seen in a quoted part of a token */
	STATE_IN_QUOTE_ESCAPE
};

/*
 * Append a single character to the NUL-terminated string stored in the
 * growable buffer *bufp, or truncate that string to "" if c is negative.
 *
 * *bufp and *buf_sizep describe the buffer and its allocated size; a NULL
 * buffer with a size of 0 is allocated on first use.  When the buffer is too
 * small it is reallocated to twice its size, but to no less than BUFSIZ
 * bytes, and *bufp and *buf_sizep are updated accordingly.
 *
 * Returns 0 on success.  Returns -1 if realloc() fails, in which case *bufp
 * and *buf_sizep are left unchanged and the old buffer remains valid.
 */
static inline int
strbuf_appendc(char **bufp, size_t *buf_sizep, int c)
{
	char	*str = *bufp;
	size_t	capacity = *buf_sizep;
	size_t	used = 0;
	size_t	needed;

	/* the current length only matters when a character is appended */
	if ((c >= 0) && (str != NULL)) {
		used = strlen(str);
	}

	/* existing text plus the terminating NUL plus the new character */
	needed = used + 1;
	if (c >= 0) {
		needed++;
	}

	if (capacity < needed) {
		capacity *= 2;
		if (capacity <= BUFSIZ) {
			capacity = BUFSIZ;
		}
		str = realloc(str, capacity);
		if (str == NULL) {
			/* *bufp still refers to the untouched old buffer */
			return (-1);
		}
	}

	if (c >= 0) {
		str[used] = c;
		used++;
	}
	str[used] = '\0';

	*buf_sizep = capacity;
	*bufp = str;

	return (0);
}

/*
 * Split the string s into shell-like tokens.
 *
 * Tokens are separated by unquoted spaces, tabs and newlines.  Single or
 * double quotes group characters, including whitespace and the respective
 * other quote character, into the current token; adjacent quoted and
 * unquoted parts are joined into a single token.  A backslash, inside or
 * outside of quotes, makes the following character a literal part of the
 * token.
 *
 * s		NUL-terminated input string, must not be NULL
 * tokencp	receives the number of tokens
 * tokenvp	receives a newly allocated, NULL-terminated array of newly
 *		allocated token strings; the caller frees each string and
 *		the array itself
 *
 * Returns 0 on success, TOK_ERR_UNTERMINATED_QUOTE if the input ends inside
 * of a quote, TOK_ERR_TRAILING_BACKSLASH if it ends right after a backslash
 * and TOK_ERR_SYSTEM_ERROR if a memory allocation fails.  If the returned
 * value is negative all tokens are freed, *tokencp and *tokenvp are left
 * untouched and errno is set to the value saved from the failed allocation,
 * or to 0 if no allocation failed.  For any non-negative value the tokens
 * completed so far are stored through tokencp and tokenvp.
 */
enum tok_err
tok_tokenize(const char *s, int *tokencp, char ***tokenvp)
{
	int		retval = TOK_ERR_SYSTEM_ERROR;
	int		saved_errno = 0;
	char		**tokenv;
	char		**shrunk;
	size_t		tokenc = 0;
	const char	*p = s;
	enum tok_states	state = STATE_INITIAL;
	char		quote = '\0';
	char		*buf = NULL;
	size_t		buf_size = 0;
	char		*token;
	size_t		i;
	int		c;

	/*
	 * allocate maximum number of tokens including the terminating NULL
	 * pointer: ceil(length / 2) + 1
	 */
	tokenv = malloc(((strlen(s) + 2 - 1) / 2 + 1) * sizeof (char *));
	if (tokenv == NULL) {
		saved_errno = errno;
		goto out;
	}
	tokenv[0] = NULL;

	for (;;) {
		/*
		 * strbuf_appendc() treats negative values as a request to
		 * reset the buffer, so bytes with the high bit set must not
		 * be passed as a possibly negative plain char
		 */
		c = (unsigned char)*p;

		switch (state) {
		case STATE_INITIAL:
			switch (*p) {
			case ' ':	/* FALLTHROUGH */
			case '\t':	/* FALLTHROUGH */
			case '\n':
				/* skip initial whitespace */
				break;
			case '"':	/* FALLTHROUGH */
			case '\'':
				/* start quoted part of token */
				state = STATE_IN_QUOTE;
				quote = *p;
				if (strbuf_appendc(&buf, &buf_size, -1) != 0) {
					saved_errno = errno;
					goto out;
				}
				break;
			case '\\':
				/* start token with a backslash escape */
				state = STATE_IN_WORD_ESCAPE;
				if (strbuf_appendc(&buf, &buf_size, -1) != 0) {
					saved_errno = errno;
					goto out;
				}
				break;
			case '\0':
				/* end of input */
				retval = 0;
				goto out;
			default:
				/* start token with a word */
				state = STATE_IN_WORD;
				if (strbuf_appendc(&buf, &buf_size, -1) != 0) {
					saved_errno = errno;
					goto out;
				}
				if (strbuf_appendc(&buf, &buf_size, c) != 0) {
					saved_errno = errno;
					goto out;
				}
			}
			break;
		case STATE_IN_WORD:
			switch (*p) {
			case ' ':	/* FALLTHROUGH */
			case '\t':	/* FALLTHROUGH */
			case '\n':	/* FALLTHROUGH */
			case '\0':
				/* end of token */
				token = strdup(buf);
				if (token == NULL) {
					saved_errno = errno;
					goto out;
				}
				/* keep the array NULL-terminated at all times */
				tokenv[tokenc++] = token;
				tokenv[tokenc] = NULL;
				if (*p == '\0') {
					retval = 0;
					goto out;
				}
				state = STATE_INITIAL;
				break;
			case '"':	/* FALLTHROUGH */
			case '\'':
				/* start quoted part of token */
				state = STATE_IN_QUOTE;
				quote = *p;
				break;
			case '\\':
				/* start backslash escape */
				state = STATE_IN_WORD_ESCAPE;
				break;
			default:
				/* regular character */
				if (strbuf_appendc(&buf, &buf_size, c) != 0) {
					saved_errno = errno;
					goto out;
				}
			}
			break;
		case STATE_IN_QUOTE:
			switch (*p) {
			case '"':	/* FALLTHROUGH */
			case '\'':
				if (*p == quote) {
					/* end quoted part of token */
					state = STATE_IN_WORD;
				} else {
					/* quote quote character */
					if (strbuf_appendc(&buf, &buf_size,
					    c) != 0) {
						saved_errno = errno;
						goto out;
					}
				}
				break;
			case '\\':
				/* start quoted backslash escape */
				state = STATE_IN_QUOTE_ESCAPE;
				break;
			case '\0':
				/* unclosed quote */
				retval = TOK_ERR_UNTERMINATED_QUOTE;
				goto out;
			default:
				/* regular character */
				if (strbuf_appendc(&buf, &buf_size, c) != 0) {
					saved_errno = errno;
					goto out;
				}
			}
			break;
		case STATE_IN_WORD_ESCAPE:	/* FALLTHROUGH */
		case STATE_IN_QUOTE_ESCAPE:
			if (*p == '\0') {
				/* trailing backslash */
				retval = TOK_ERR_TRAILING_BACKSLASH;
				goto out;
			}
			/* escaped character, return to the enclosing state */
			state = (state == STATE_IN_WORD_ESCAPE) ?
			    STATE_IN_WORD : STATE_IN_QUOTE;
			if (strbuf_appendc(&buf, &buf_size, c) != 0) {
				saved_errno = errno;
				goto out;
			}
			break;
		}
		p++;
	}

out:
	if (retval < 0) {
		for (i = 0; i < tokenc; i++) {
			free(tokenv[i]);
		}
		free(tokenv);
	} else {
		/*
		 * trim the array to the number of tokens actually found; if
		 * that fails the larger original array is still valid and
		 * NULL-terminated, so hand it out instead of leaking it and
		 * returning a NULL array
		 */
		shrunk = realloc(tokenv, (tokenc + 1) * sizeof (char *));
		*tokencp = tokenc;
		*tokenvp = (shrunk != NULL) ? shrunk : tokenv;
	}
	free(buf);
	if (retval < 0) {
		/* the cleanup above may have modified errno */
		errno = saved_errno;
	}

	return (retval);
}