/*
 * File: utf2any.l
 *
 * (c) Peter Kleiweg 2000
 *
 * This is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2,
 * or (at your option) any later version.
 *
 * Compile:
 *     flex -B -8 utf2any.l
 *     gcc -s -Wall -o utf2any lex.yy.c -lfl
 *     rm lex.yy.c
 *
 */

%{

/* Version string; it is pasted into the usage text printed by syntax (). */
#define UTFanyVERSION "1.0"

/*
 * MAPDIR is the directory where symbol maps are searched for.
 * This should be a path, ending with a slash, surrounded by double quotes,
 * or it should be NULL.
 *
 * It is passed to readtrans () for every '-f' option: when a map file
 * cannot be opened as given and its name has no directory part, the name
 * is appended to this directory and the open is retried.  It may be
 * overridden on the compiler command line.
 */

#ifndef MAPDIR
#  ifdef __MSDOS__
#    define MAPDIR "c:\\utf\\"
#  else
#    define MAPDIR "/usr/local/lib/utf/"
#  endif
#endif

#ifdef __MSDOS__
#  ifndef __COMPACT__
#    error Memory model COMPACT required
#  endif
#  include <dir.h>
#  include <fcntl.h>
#else
#  include <unistd.h>
#endif
#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __MSDOS__
#define strcasecmp(A, B) (stricmp((A), (B)))
#endif

/*
 * Size used for the line buffers (buffer, bufword, buf2 are declared with
 * BUFSIZE + 1 bytes) and as the limit passed to fgets () in getline ().
 */
#define BUFSIZE 2048

/* Boolean flag type. */
typedef enum {
    FALSE = 0,
    TRUE
} BOOL_;

/* Input encoding selected on the command line; uUNDEF means "not chosen". */
typedef enum {
    uUNDEF = 0,
    uUTF7,
    uUTF8
} UTF_;

/* What to do with a code point that falls inside a defined range. */
typedef enum {
    aECHO,      /* copy the byte unchanged (only below 256) */
    aSPACE,     /* replace by a single space */
    aSKIP,      /* drop it */
    aFORMAT     /* print it through a printf-style format */
} ACTION_;

/* Argument type the code point is cast to before formatting. */
typedef enum {
    cUCHAR,
    cUNSIGNED,
    cULONG,
    cNONE
} CAST_;

/*
 * One single-character translation for a code point of 256 or above:
 * the code point, the sequence number in which it was defined (used to
 * decide between duplicate definitions), and the replacement string.
 */
typedef struct {
    unsigned long ul;       /* code point */
    unsigned long order;    /* definition sequence number */
    char *s;                /* replacement text */
} TRANS_;

/*
 * Action for an inclusive range of code points [from, to].
 * cast and format are only filled in when action is aFORMAT.
 */
typedef struct {
    unsigned long from;     /* first code point of the range */
    unsigned long to;       /* last code point of the range */
    ACTION_ action;         /* what to do with a matching code point */
    CAST_ cast;             /* argument type handed to the format */
    char *format;           /* printf-style format string */
} RANGE_;

/*
 * Global state shared by the scanner actions and the map-file reader.
 */

/* Command line flags: '-v' reports map files as they are read,
 * '-w' enables the messages printed by codewarn (). */
BOOL_
    verbose = FALSE,
    warnings = FALSE;

/* Input encoding chosen with '-7' or '-8'; uUNDEF until one is given. */
UTF_
    utf_type = uUNDEF;

/* Translations for code points >= 256; filled by addchar (), sorted in
 * main () and looked up with bsearch () in outsymbol (). */
TRANS_
    *trans = NULL;

/* Range actions for code points >= 256; filled by addaction () and
 * scanned from the last entry to the first in outsymbol (). */
RANGE_
    *range = NULL;

/*
 * buf2          - scratch buffer for formatted replacements (addaction)
 * buffer        - current line of the map file being read (getline)
 * bufword       - last word extracted by getword ()
 * f_unsigned    - default format for built-in ranges up to 0xFFFF
 * f_ulong       - default format for the built-in range from 0x10000
 * infile        - input name shown in warning messages
 * lower         - replacement string per code point < 256; NULL means
 *                 the byte is written unchanged
 * no_mem_buffer - block allocated at start-up and released just before
 *                 an out-of-memory error is reported
 * out_of_memory - text of that error
 * programname   - name of the program, used in messages
 * s_echo, s_skip, s_space - keywords recognised in map files
 */
char
    buf2    [BUFSIZE + 1],
    buffer  [BUFSIZE + 1],
    bufword [BUFSIZE + 1],
    f_unsigned [] = "[U+%04X]",
    f_ulong    [] = "[U+%08lX]",
    *infile,
    *lower [256],
    *no_mem_buffer,
    out_of_memory [] = "Out of memory",
    *programname,
    s_echo  [] = "#ECHO#",
    s_skip  [] = "#SKIP#",
    s_space [] = "#SPACE#";

/*
 * bufp      - current parse position within buffer
 * max_range - allocated number of entries in range
 * max_trans - allocated number of entries in trans
 * n_range   - used number of entries in range
 * n_trans   - used number of entries in trans
 * wtable    - per code point < 256: non-zero if output of that code
 *             should trigger codewarn ()
 */
int
    bufp,
    max_range = 0,
    max_trans = 0,
    n_range = 0,
    n_trans = 0,
    wtable [256];

/*
 * UTF-7 decoder state:
 * instep  - position (0..3) within the current group of four base64 chars
 * outcode - the two bytes of the 16-bit value being assembled
 * outstep - index into outcode of the byte currently being filled
 */
unsigned int
    instep,
    outcode [2],
    outstep;

/*
 * order   - sequence counter handed out to entries added to trans
 * incount - current input line number, reported by codewarn ()
 */
unsigned long
    order = 0,
    incount = 1;

/*
 * Forward declarations of all functions defined in the user code section
 * at the end of this file, grouped by return type.
 */

void
    addchar (char *filename, int lineno, unsigned long ul, char *s),
    addaction (
        char *filename,
        int lineno,
        long unsigned from,
        long unsigned to,
        ACTION_ action,
	CAST_ cast,
        char *format
    ),
    bytes2 (void),
    bytes3 (void),
    bytes4 (void),
    bytes5 (void),
    bytes6 (void),
    codewarn (unsigned long ul),
    errit (char const *format, ...),
    ferrit (char *filename, int lineno, char const *format, ...),
    get_programname (char const *argv0),
    nextout (void),
    outchar (unsigned char i),
    outsymbol (unsigned long ul),
    readtrans (char *file, char *dir, int level),
    *s_malloc (size_t size),
    *s_realloc (void *block, size_t size),
    syntax (void),
    utf7 (void);

char
    *getbasename (char *s),
    *getdirname (char *s),
    *getword (char *filename, int lineno),
    *s_strdup (char const *s);

int
    getline (FILE *fp, int *lineno),
    nlcount (void),
    searchcmp (const void *p1, const void *p2),
    srtcmp (const void *p1, const void *p2);

long unsigned
    getvalue (char *filename, int lineno);

/*
 * Scanner configuration macros, followed by a local yywrap ().
 * Any macro named yywrap is removed first so the function below can be
 * defined.  It always returns 1, which tells the scanner that there is
 * no further input once the current file is exhausted.
 */
#define YY_NO_UNPUT
#define YY_SKIP_YYWRAP
#ifdef yywrap
#  undef yywrap
#endif
int yywrap()
{
    return 1;
}

%}

/*
 * Scanner rules.
 *
 * INITIAL : the first character is pushed back with yyless (0) and the
 *           scanner switches to _utf7 or _utf8 depending on utf_type.
 * _utf7   : plain text of a UTF-7 stream.  "+-" stands for a literal '+';
 *           a single "+" resets the decoder state and enters _utf7b.
 * _utf7b  : inside a base64 section.  Base64 characters are fed to
 *           utf7 (); "-" ends the section; any other character ends the
 *           section and is written with outchar ().
 * _utf8   : a lead byte in one of the octal ranges below, followed by the
 *           number of bytes matched by the dots, is decoded by
 *           bytes2 () .. bytes6 ().  The trailing bytes are matched with
 *           '.', so they are not checked to be continuation bytes.
 *
 * In _utf7 and _utf8 every other character is written with outchar (),
 * and newlines advance the line counter incount.
 */
%Start _utf7 _utf7b _utf8

%%

<INITIAL>.|\n                 { yyless (0);
                                BEGIN ((utf_type == uUTF7) ? _utf7 : _utf8);
                              }

<_utf7>{
"+-"                          { outchar ('+'); }
"+"      		      { instep = outstep = 0;
                                BEGIN _utf7b; }
}

<_utf7b>{
[A-Za-z0-9+/]   	      { utf7 (); }
"-"     		      { BEGIN _utf7; }
.|\n	        	      { if (yytext [0] == '\n')
                                    incount++;
                                outchar (yytext [0]);
                                BEGIN _utf7; }
}

<_utf8>{
[\300-\337].		      { incount += nlcount (); bytes2 (); }
[\340-\357]..                 { incount += nlcount (); bytes3 (); }
[\360-\367]...                { incount += nlcount (); bytes4 (); }
[\370-\373]....		      { incount += nlcount (); bytes5 (); }
[\374-\375].....	      { incount += nlcount (); bytes6 (); }
}

<_utf7,_utf8>.|\n             { if (yytext [0] == '\n')
                                    incount++;
                                outchar (yytext [0]);
                              }

%%

/*
 * Helper functions for UTF-7 parser
 */

/*
 * Consume one base64 character (yytext [0]) of a UTF-7 section.
 *
 * The character is mapped to its 6-bit value and its bits are distributed
 * over the byte currently being assembled in outcode [outstep]; whenever a
 * byte is complete, nextout () is called.  instep cycles through 0..3 and
 * tells which position of the four-character base64 group this is.
 */
void utf7 ()
{
    unsigned
        ch,
        sextet;

    ch = yytext [0];
    if (ch >= 'A' && ch <= 'Z')
        sextet = ch - 'A';
    else if (ch >= 'a' && ch <= 'z')
        sextet = ch - 'a' + 26;
    else if (ch >= '0' && ch <= '9')
        sextet = ch - '0' + 52;
    else if (ch == '+')
        sextet = 62;
    else if (ch == '/')
        sextet = 63;

    if (instep == 0) {
        /* six bits start a new byte */
        outcode [outstep] = (sextet << 2);
    } else if (instep == 1) {
        /* two bits finish the byte, four start the next */
        outcode [outstep] |= (sextet >> 4);
        nextout ();
        outcode [outstep] = (sextet << 4);
    } else if (instep == 2) {
        /* four bits finish the byte, two start the next */
        outcode [outstep] |= (sextet >> 2);
        nextout ();
        outcode [outstep] = (sextet << 6);
    } else if (instep == 3) {
        /* the last six bits finish the byte */
        outcode [outstep] |= sextet;
        nextout ();
    }

    instep++;
    if (instep == 4)
        instep = 0;
}

/*
 * Called when a byte in outcode [] is complete.
 * After the first byte only the byte index advances; after the second
 * byte the two are combined (first byte high) and sent to outsymbol ().
 */
void nextout ()
{
    if (outstep == 0) {
        outstep = 1;
        return;
    }

    outstep = 0;
    outsymbol (((outcode [0] & 0xFF) << 8) | (outcode [1] & 0xFF));
}

/*
 * Helper functions for UTF-8 parser
 */

/*
 * Decode the two-byte UTF-8 sequence in yytext and output the symbol:
 * 5 payload bits from the lead byte, 6 from the second byte.
 */
void bytes2 ()
{
    unsigned
        lead = (unsigned char) yytext [0],
        tail = (unsigned char) yytext [1];

    outsymbol (((lead & 0x1F) << 6) | (tail & 0x3F));
}


/*
 * Decode the three-byte UTF-8 sequence in yytext and output the symbol:
 * 4 payload bits from the lead byte, 6 from each following byte.
 */
void bytes3 ()
{
    unsigned
        b0 = (unsigned char) yytext [0],
        b1 = (unsigned char) yytext [1],
        b2 = (unsigned char) yytext [2],
        code;

    code  = (b0 & 0x0F) << 12;
    code |= (b1 & 0x3F) <<  6;
    code |= (b2 & 0x3F);

    outsymbol (code);
}

/*
 * Decode the four-byte UTF-8 sequence in yytext and output the symbol:
 * 3 payload bits from the lead byte, 6 from each following byte.
 */
void bytes4 ()
{
    long unsigned
        code;
    int
        k;

    code = (long unsigned) ((unsigned char) yytext [0] & 0x07);
    for (k = 1; k < 4; k++)
        code = (code << 6) | (long unsigned) ((unsigned char) yytext [k] & 0x3F);

    outsymbol (code);
}

/*
 * Decode the five-byte sequence in yytext and output the symbol:
 * 2 payload bits from the lead byte, 6 from each following byte.
 */
void bytes5 ()
{
    long unsigned
        code;
    int
        k;

    code = (long unsigned) ((unsigned char) yytext [0] & 0x03);
    for (k = 1; k < 5; k++)
        code = (code << 6) | (long unsigned) ((unsigned char) yytext [k] & 0x3F);

    outsymbol (code);
}

/*
 * Decode the six-byte sequence in yytext and output the symbol:
 * 1 payload bit from the lead byte, 6 from each following byte.
 */
void bytes6 ()
{
    long unsigned
        code;
    int
        k;

    code = (long unsigned) ((unsigned char) yytext [0] & 0x01);
    for (k = 1; k < 6; k++)
        code = (code << 6) | (long unsigned) ((unsigned char) yytext [k] & 0x3F);

    outsymbol (code);
}

/*
 * General helper functions for parser
 */

/*
 * Write a single byte value to yyout.
 * If wtable marks the value, a warning is issued first.  When lower []
 * holds a replacement string for it, that string is written instead of
 * the byte itself.
 */
void outchar (unsigned char c)
{
    char
        *replacement;

    if (wtable [c])
        codewarn (c);

    replacement = lower [c];
    if (replacement)
        fputs (replacement, yyout);
    else
        fputc (c, yyout);
}

void outsymbol (unsigned long ul)
{
    int
	i;
    TRANS_
        *p;

    if (ul < 256) {
        if (wtable [ul])
	    codewarn (ul);
	if (! lower [ul])
	    fputc ((unsigned int) ul, yyout);
	else
	    fputs (lower [ul], yyout);
	return;
    }

    p = (TRANS_ *) bsearch (&ul, trans, n_trans, sizeof (TRANS_), searchcmp);
    if (p) {
	fputs (p->s, yyout);
	return;
    }
    
    codewarn (ul);

    for (i = n_range - 1; i >= 0; i--)
	if (ul >= range [i].from && ul <= range [i].to)
	    break;
    switch (range [i].action) {
	case aSPACE:
	    fputc (' ', yyout);
	    break;
        case aFORMAT:
	    if (range [i].cast == cUCHAR)
		fprintf (yyout, range [i].format, (unsigned char) ul);
	    else if (range [i].cast == cUNSIGNED)
		fprintf (yyout, range [i].format, (unsigned) ul);
	    else
		fprintf (yyout, range [i].format, (unsigned long) ul);
	    break;
	case aECHO:
	    /* won't happen beyond 255 */
	case aSKIP:
	    break;
    }
}

/*
 * When warnings are enabled, report a code point on stderr together with
 * the input name and the current input line, in hexadecimal and decimal.
 * Values below 0x10000 use the short four-digit form.
 */
void codewarn (unsigned long ul)
{
    if (warnings) {
        if (ul >= 0x10000)
            fprintf (stderr, "%s:%lu: U+%08lX %10lu\n", infile, incount, ul, ul);
        else
            fprintf (stderr, "%s:%lu: U+%04X %5u\n", infile, incount, (unsigned) ul, (unsigned) ul);
    }
}

/*
 * Count the newline characters in yytext, scanning up to its
 * terminating NUL.
 */
int nlcount ()
{
    char
        *scan;
    int
        newlines = 0;

    for (scan = yytext; *scan; scan++)
        newlines += (*scan == '\n');
    return newlines;
}


/*
 * Program entry point.
 *
 *  1. Install the built-in range actions and the pass-through characters
 *     (tab, newline, carriage return, form feed).
 *  2. Process the options -7, -8, -f mapfile, -v and -w; map files are
 *     read in the order given.
 *  3. Sort the translation table and remove duplicate code points,
 *     keeping the definition that was added last.
 *  4. Open the input (a file argument, or stdin when it is not a
 *     terminal) and run the scanner, writing to stdout.
 *
 * Returns 0 on success; every error path exits with status 1 through
 * errit () or syntax ().
 */
int main (int argc, char *argv [])
{
    int
        i;

    /* reserve released by s_malloc ()/s_realloc () when memory runs out */
    no_mem_buffer = (char *) malloc (1024);

    get_programname (argv [0]);

    /* pre-defined actions */
    for (i = 0; i < 256; i++) {
        lower [i] = NULL;
        wtable [i] = 0;
    }
    addaction (NULL, 0, 0x10000, 0x7FFFFFFF, aFORMAT, cULONG,    f_ulong);
    addaction (NULL, 0,   0x100,     0xFFFF, aFORMAT, cUNSIGNED, f_unsigned);
    addaction (NULL, 0,    0x7F,       0x9F, aFORMAT, cUNSIGNED, f_unsigned);
    addaction (NULL, 0,       0,       0x1F, aFORMAT, cUNSIGNED, f_unsigned);
    addchar (NULL, 0, '\t', s_echo);
    addchar (NULL, 0, '\n', s_echo);
    addchar (NULL, 0, '\r', s_echo);
    addchar (NULL, 0, '\f', s_echo);

    while (argc > 1 && argv [1][0] == '-') {
        if (! strcmp (argv [1], "-7"))
            utf_type = uUTF7;
        else if (! strcmp (argv [1], "-8"))
            utf_type = uUTF8;
        else if (argv [1][1] == 'f') {
            if (argv [1][2])
                /* file name attached to the option: -ffile */
                readtrans (argv [1] + 2, MAPDIR, 0);
            else {
                if (argc == 2)
                    errit ("Missing argument for option '-f'");
                argv++;
                argc--;
                readtrans (argv [1], MAPDIR, 0);
            }
        } else if (! strcmp (argv [1], "-v"))
            verbose = TRUE;
        else if (! strcmp (argv [1], "-w"))
            warnings = TRUE;
        else
            syntax ();
        argv++;
        argc--;
    }

    if (n_trans) {
        qsort (trans, n_trans, sizeof (TRANS_), srtcmp);
        i = 0;
        while (i < n_trans - 1)
            if (trans [i].ul == trans [i + 1].ul) {
                /*
                 * Equal code points are sorted by definition order, so
                 * entry i is the older one: drop it and release its string.
                 */
                free (trans [i].s);
                memmove (trans + i, trans + i + 1, (n_trans - i - 1) * sizeof (TRANS_));
                n_trans--;
            } else
                i++;
    }

    switch (argc) {
        case 1:
            if (isatty (fileno (stdin)))
                syntax ();
            yyin = stdin;
            infile = "(stdin)";
            break;
        case 2:
            yyin = fopen (argv [1], "r");
            if (! yyin)
                errit ("Opening file \"%s\": %s", argv [1], strerror (errno));
            infile = argv [1];
            break;
        default:
            syntax ();
    }

    if (! utf_type)
        errit ("Missing option '-7' or '-8'");

    yyout = stdout;

#ifdef __MSDOS__
    setmode (fileno (yyin ), O_BINARY);
    setmode (fileno (yyout), O_BINARY);
#endif

    yylex ();

    if (yyin != stdin)
        fclose (yyin);
    /* close the output stream itself, not the input a second time */
    if (yyout != stdout)
        fclose (yyout);

    return 0;
}

int srtcmp (const void *p1, const void *p2)
{
    unsigned long
        ul1,
	ul2;

    
    ul1 = ((TRANS_ *)p1)->ul;
    ul2 = ((TRANS_ *)p2)->ul;
    if (ul1 < ul2)
	return -1;
    else if (ul1 > ul2)
	return 1;

    ul1 = ((TRANS_ *)p1)->order;
    ul2 = ((TRANS_ *)p2)->order;
    if (ul1 < ul2)
	return -1;
    else
	return 1;
}

/*
 * bsearch () comparison: p1 points to the code point searched for
 * (an unsigned long), p2 to a TRANS_ entry.
 * Returns -1, 0 or 1 as the key is below, equal to or above the entry.
 */
int searchcmp (const void *p1, const void *p2)
{
    unsigned long
        key = *((unsigned long *)p1),
        entry = ((TRANS_ *)p2)->ul;

    return (key > entry) - (key < entry);
}

/*
 * Read a map file and register its definitions.
 *
 *   file   name of the map file
 *   dir    directory to retry in when the file cannot be opened and its
 *          name has no directory part; may be NULL
 *   level  include nesting depth; more than 10 is an error
 *
 * Each non-empty, non-comment line is one of:
 *   d from to action     action for a range of code points; the action is
 *                        #SKIP#, #ECHO#, #SPACE#, or one of uchar,
 *                        unsigned, ulong followed by a format string
 *   i file               include another map file; it is searched
 *                        relative to the directory of the current file
 *   value text           translation of a single code point
 *
 * All errors are fatal.  Strings allocated here (the constructed file
 * name, the directory name and the include name) are released before
 * returning; none of them is retained by addchar () or addaction ().
 */
void readtrans (char *file, char *dir, int level)
{
    int
        lineno;
    long unsigned
        from,
        to,
        ul;
    char
        *s,
        *filename,
        *basename,
        *dirname,
        *owndir,
        *include;
    FILE
        *fp;
    CAST_
        cast = cNONE;

    if (level > 10)
        errit ("File \"%s\": nesting too deep", file);

    /*
     * Try opening file
     * If failure and filename has no directory part, then try in dir
     */
    filename = file;
    fp = fopen (filename, "r");
    if ((! fp) && dir) {
        basename = getbasename (filename);
        if (! strcmp (basename, filename)) {
            filename = (char *) s_malloc (
                           (strlen (basename) + strlen (dir) + 1) * sizeof (char)
                       );
            strcpy (filename, dir);
            strcat (filename, basename);
            fp = fopen (filename, "r");
        }
    }
    if (! fp)
        errit ("Opening file \"%s\": %s", filename, strerror (errno));
    if (verbose)
        fprintf (stderr, "Begin %s\n", filename);

    /* included files are searched next to this file, else in dir */
    owndir = getdirname (filename);
    dirname = owndir ? owndir : dir;

    lineno = 0;
    while (getline (fp, &lineno)) {
        switch (buffer [bufp]) {
            /* translation for range of characters */
            case 'd':
            case 'D':
                bufp++;
                from = getvalue (filename, lineno);
                to = getvalue (filename, lineno);
                s = getword (filename, lineno);
                if (! strcasecmp (s, s_skip))
                    addaction (filename, lineno, from, to, aSKIP, cNONE, NULL);
                else if (! strcasecmp (s, s_echo))
                    addaction (filename, lineno, from, to, aECHO, cNONE, NULL);
                else if (! strcasecmp (s, s_space))
                    addaction (filename, lineno, from, to, aSPACE, cNONE, NULL);
                else {
                    if (! strcmp (s, "uchar"))
                        cast = cUCHAR;
                    else if (! strcmp (s, "unsigned"))
                        cast = cUNSIGNED;
                    else if (! strcmp (s, "ulong"))
                        cast = cULONG;
                    else
                        ferrit (filename, lineno, "Illegal action \"%s\"", s);
                    /* the rest of the line is the format string */
                    addaction (filename, lineno, from, to, aFORMAT, cast, buffer + bufp);
                }
                break;
            /* include file */
            case 'i':
            case 'I':
                bufp++;
                /*
                 * The name must be copied: the nested call overwrites the
                 * shared line and word buffers.
                 */
                include = s_strdup (getword (filename, lineno));
                readtrans (include, dirname, level + 1);
                free (include);
                break;
            /* single character translation */
            default:
                ul = getvalue (filename, lineno);
                addchar (filename, lineno, ul, buffer + bufp);
        }
    }
    fclose (fp);
    if (verbose)
        fprintf (stderr, "End %s\n", filename);

    free (owndir);
    if (filename != file)
        free (filename);
}

void addchar (char *filename, int lineno, unsigned long ul, char *s)
{
    BOOL_
        echo;

    if (ul > 0x7fffffff)
        ferrit (filename, lineno, "%s out of range: 0x%lX", s, ul);

    echo = FALSE;

    if (! strcasecmp (s, s_skip))
	s = "";
    else if (! strcasecmp (s, s_space))
	s = " ";
    else if (! strcasecmp (s, s_echo))
	echo = TRUE;

    if (ul < 256) {
	lower [ul] = echo ? NULL : s_strdup (s);
	wtable [ul] = 0;
    } else {
	if (echo)
	    ferrit (filename, lineno, "%s out of range: 0x%lX", s_echo, ul);
	if (n_trans == max_trans) {
	    max_trans += 1024;
	    trans = (TRANS_ *) s_realloc (trans, max_trans * sizeof (TRANS_));
	}
	trans [n_trans].ul = ul;
	trans [n_trans].order = order++;
	trans [n_trans++].s = s_strdup (s);
    }
}

/*
 * Register an action for the inclusive range of code points [from, to].
 *
 *   filename, lineno  where the definition came from (for error messages)
 *   from, to          range limits; both must be at most 0x7fffffff and
 *                     from must not exceed to
 *   action            aSKIP, aSPACE, aECHO or aFORMAT
 *   cast              argument type used with format (aFORMAT only)
 *   format            printf-style format string (aFORMAT only)
 *
 * The part of the range below 256 is expanded right away into the
 * lower [] table, and those code points are flagged in wtable [].
 * The part from 256 upwards is appended to the range array, which is
 * grown in steps of 256 entries; aECHO is an error there.
 */
void addaction (char *filename,
                int lineno,
                long unsigned from,
                long unsigned to,
                ACTION_ action,
                CAST_ cast,
                char *format)
{
    long unsigned
        code;
    RANGE_
        *slot;

    if (from > to)
        ferrit (filename, lineno, "Illegal range");

    if (from > 0x7fffffff)
        ferrit (filename, lineno, "Begin of range to large: 0x%lX", from);

    if (to > 0x7fffffff)
        ferrit (filename, lineno, "End of range to large: 0x%lX", to);

    /* low part: one table entry per code point */
    for (code = from; code < 256 && code <= to; code++) {
        wtable [code] = 1;
        switch (action) {
            case aSKIP:
                lower [code] = "";
                break;
            case aSPACE:
                lower [code] = " ";
                break;
            case aECHO:
                lower [code] = NULL;
                break;
            default:
                switch (cast) {
                    case cUCHAR:
                        sprintf (buf2, format, (unsigned char) code);
                        break;
                    case cUNSIGNED:
                        sprintf (buf2, format, (unsigned) code);
                        break;
                    default:
                        sprintf (buf2, format, (long unsigned) code);
                        break;
                }
                lower [code] = s_strdup (buf2);
                break;
        }
    }

    /* high part: keep only what lies at or above 256 */
    if (from < 256)
        from = 256;
    if (from > to)
        return;

    if (action == aECHO)
        ferrit (filename, lineno, "Out of range for %s", s_echo);

    if (n_range == max_range) {
        max_range += 256;
        range = (RANGE_ *) s_realloc (range, max_range * sizeof (RANGE_));
    }

    slot = range + n_range;
    slot->from = from;
    slot->to = to;
    slot->action = action;
    if (action == aFORMAT) {
        slot->cast = cast;
        slot->format = s_strdup (format);
    }
    n_range++;
}

/*
 * Parse an unsigned number at the current position in the line buffer.
 *
 * Accepted forms: "U+" or "0x" (either case) followed by hexadecimal
 * digits, a leading "0" for octal, anything else is read as decimal.
 * White space before and after the number is skipped and bufp is left
 * on the first character after it.
 *
 * A missing number is a fatal error reported with the file name and
 * line number given.
 */
long unsigned getvalue (char *filename, int lineno)
{
    long unsigned
        value;
    int
        used,
        is_hex;
    char
        *fmt;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    is_hex = 0;
    if (buffer [bufp] == 'u' || buffer [bufp] == 'U')
        is_hex = (buffer [bufp + 1] == '+');
    else if (buffer [bufp] == '0')
        is_hex = (buffer [bufp + 1] == 'x' || buffer [bufp + 1] == 'X');

    if (is_hex) {
        /* step over the two-character prefix */
        bufp += 2;
        fmt = "%lx%n";
    } else
        fmt = (buffer [bufp] == '0') ? "%lo%n" : "%lu%n";

    if (sscanf (buffer + bufp, fmt, &value, &used) != 1)
        errit ("Missing value in \"%s\", line %i", filename, lineno);
    bufp += used;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    return value;
}

/*
 * Copy the next white-space delimited word of the line buffer into
 * bufword and return bufword.  White space before and after the word is
 * skipped and bufp is left on the first character after it.
 *
 * A missing word is a fatal error reported with the file name and line
 * number given.  The result is overwritten by the next call.
 */
char *getword (char *filename, int lineno)
{
    int
        used;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    if (sscanf (buffer + bufp, "%s%n", bufword, &used) != 1)
        errit ("Missing word in \"%s\", line %i", filename, lineno);
    bufp += used;

    for (; buffer [bufp] && isspace ((unsigned char) buffer [bufp]); bufp++)
        ;

    return bufword;
}

/*
 * Return a pointer to the part of filename that follows the last
 * directory separator ('\\' on MS-DOS, '/' elsewhere), or filename
 * itself when it contains no separator.  Nothing is allocated; the
 * result points into filename.
 */
char *getbasename (char *filename)
{
    char
        *sep;

#ifdef __MSDOS__
    sep = strrchr (filename, '\\');
#else   /* unix */
    sep = strrchr (filename, '/');
#endif

    return sep ? sep + 1 : filename;
}

/*
 * Return a newly allocated copy of the directory part of filename,
 * including the final separator ('\\' on MS-DOS, '/' elsewhere), or
 * NULL when filename contains no separator.
 *
 * filename is cut off temporarily after the separator while the copy is
 * made and restored afterwards, so it must be writable.
 */
char *getdirname (char *filename)
{
    char
        saved,
        *sep,
        *copy;

#ifdef __MSDOS__
    sep = strrchr (filename, '\\');
#else   /* unix */
    sep = strrchr (filename, '/');
#endif

    if (! sep)
        return NULL;

    saved = sep [1];
    sep [1] = '\0';
    copy = s_strdup (filename);
    sep [1] = saved;

    return copy;
}

/*
 * Read the next significant line of a map file into the global buffer.
 *
 * Trailing white space is removed, bufp is set to the first character
 * that is not white space, and lines that are empty or start with '#'
 * are skipped.  *lineno is incremented for every line read, skipped or
 * not.
 *
 * Returns 1 when a line is available, 0 at end of file.
 *
 * NOTE(review): the name is the same as POSIX getline () declared in
 * <stdio.h> on some systems - confirm that this does not clash on the
 * target platform.
 */
int getline (FILE *fp, int *lineno)
{
    int
        len;

    while (fgets (buffer, BUFSIZE, fp) != NULL) {
        ++*lineno;

        /* strip trailing white space, including the newline */
        for (len = strlen (buffer);
             len && isspace ((unsigned char) buffer [len - 1]);
             len--)
            buffer [len - 1] = '\0';

        /* skip leading white space */
        for (bufp = 0;
             buffer [bufp] && isspace ((unsigned char) buffer [bufp]);
             bufp++)
            ;

        if (buffer [bufp] != '\0' && buffer [bufp] != '#')
            return 1;
    }

    return 0;
}

/*
 * Report a fatal error in a map file and exit with status 1.
 *
 *   filename, lineno  location of the error; filename is NULL for the
 *                     definitions built into the program
 *   format, ...       printf-style message
 *
 * Does not return.
 */
void ferrit (char *filename, int lineno, char const *format, ...)
{
    va_list
        list;

    /* built-in definitions have no file: never hand NULL to "%s" */
    fprintf (stderr, "\nError %s: in file \"%s\", line %i: ", programname,
             filename ? filename : "(built-in)", lineno);

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Report a fatal error and exit with status 1.
 *
 *   format, ...  printf-style message, printed after the program name
 *
 * Does not return.
 */
void errit (char const *format, ...)
{
    va_list
        list;

    fprintf (stderr, "\nError %s: ", programname);

    va_start (list, format);
    vfprintf (stderr, format, list);
    va_end (list);

    fprintf (stderr, "\n\n");

    exit (1);
}

/*
 * Set the global programname from argv [0].
 * On MS-DOS the file name component is taken with fnsplit (); elsewhere
 * everything after the last '/' is used, or all of argv0 when it has no
 * '/'.  The name is stored in a copy made with strdup ().
 */
void get_programname (char const *argv0)
{
#ifdef __MSDOS__
    char
        stem [MAXFILE];

    fnsplit (argv0, NULL, NULL, stem, NULL);
    programname = strdup (stem);
#else   /* unix */
    char
        *slash;

    slash = strrchr (argv0, '/');
    programname = strdup (slash ? slash + 1 : argv0);
#endif
}

/*
 * malloc () that does not return on failure: when the allocation fails,
 * the reserve block no_mem_buffer is released and the program stops with
 * an out-of-memory error.
 */
void *s_malloc (size_t size)
{
    void
        *mem = malloc (size);

    if (mem)
        return mem;

    free (no_mem_buffer);
    errit (out_of_memory);
    return mem;
}

/*
 * realloc () that does not return on failure: when the reallocation
 * fails, the reserve block no_mem_buffer is released and the program
 * stops with an out-of-memory error.
 */
void *s_realloc (void *block, size_t size)
{
    void
        *mem = realloc (block, size);

    if (mem)
        return mem;

    free (no_mem_buffer);
    errit (out_of_memory);
    return mem;
}

/*
 * Return a copy of s in memory obtained from s_malloc ().
 * A NULL argument yields a newly allocated empty string.
 */
char *s_strdup (char const *s)
{
    char const
        *src = s ? s : "";
    char
        *copy;

    copy = (char *) s_malloc (strlen (src) + 1);
    strcpy (copy, src);
    return copy;
}

void syntax ()
{
    fprintf (
	stderr,
	"\n"
	"This is utf2any, version " UTFanyVERSION "\n"
	"\n"
        "Usage: %s -7|-8 [-f mapfile] [-v] [-w] [infile]\n"
	"\n"
	"  -7 : Input is UTF-7\n"
	"  -8 : Input is UTF-8\n"
	"  -f : File with definitions of the symbol mappings\n"
	"       If multiple -f options are given, the files are processed in turn\n"
	"  -v : Verbose\n"
	"  -w : Warning messages\n"
        "\n",
	programname
    );
    exit (1);
}
