
/* utf8.c */

/* Codeset names.  locale() compares each of these against the string
 * returned by nl_langinfo( CODESET ).  */
#define CODESET_ASCII  "ANSI_X3.4-1968"
#define CODESET_LATIN1 "ISO-8859-1"
#define CODESET_UTF8   "UTF-8"
/* Locale names.  locale() compares each of these against the user's
 * LC_CTYPE locale string.  */
#define LOCALE_C       "C"
#define LOCALE_C2      "POSIX" /* a synonym for LOCALE_C */

/* This file provides the functions utf8() and locale().
 *
 * The function utf8() transliterates one or more strings from Latin-1
 * to utf-8 encoding.  The function is variadic (it accepts a variable
 * number of arguments), but returns a single concatenated translated
 * string.
 *
 * The function utf8() can return output in a caller-provided buffer or
 * in its own buffer.  Regarding its own buffer, the function actually
 * maintains a ring of SIZE_UTF8_RING separate buffers.  A return buffer
 * thus survives several calls, but eventually the ring wraps and the
 * return buffer is overwritten.  Such a design is inelegant but fast
 * (an elegant design would allocate buffers dynamically as needed), and
 * it is believed that speed matters here.
 *
 * Nothing prevents a caller from calling utf8() directly.  However, as
 * the interface is somewhat cumbersome and since the function does not
 * check if the -u command-line option is set, the typical caller will
 * probably access utf8() through one of the macros defined in utf8.h.
 * If utf8.h yet lacks the appropriate macro, the programmer may wish to
 * create the desired new macro there.
 *
 * The function locale() reads the locale the user has set, if any, and
 * defaults the character-encoding options accordingly.
 *
 * (The author added this file and its associated header to the
 * program in January and February 2005, when the program was nearly a
 * year old, as at least a partial response to the good advice he had
 * received in the list subthread beginning at
 * [http://lists.debian.org/debian-devel/2004/12/msg00571.html].
 * Because it represents a design afterthought, the file and header
 * merge into the whole program slightly less nimbly than one might
 * like.  Among other infelicities, the file and header force several
 * declarations in the program's various other files and headers to
 * comment out otherwise valid `const' qualifiers.)
 *
 * (The author wishes to acknowledge Maciej Dems' useful testing and
 * advice generally with respect to this file.)
 *
 */

#include "utf8.h"
#include "alloc.h"
#include "gen.h"
#include <stdarg.h>
#include <iconv.h>
#include <locale.h>
#include <langinfo.h>

/* State kept across calls to utf8().  All three are set up by utf8()
 * itself the first time it is called.  */

/* utf8()'s own ring of output buffers, each UTF8_SIZE_BUF bytes long;
 * allocated in utf8() with room for SIZE_UTF8_RING buffers.  */
static char ( *out0 )[UTF8_SIZE_BUF];
/* Index of the ring buffer that the next own-buffer call to utf8()
 * will write into.  */
static int     i_out0;
/* The iconv() conversion descriptor.  The value (iconv_t)(-1) marks
 * it as not yet opened.  */
static iconv_t conv_descriptor = (iconv_t)(-1);

static inline void convert_one_character(
  char  **const in ,
  char  **const out,
  size_t *const size_out
) {
  size_t size_in = 1; /* a dummy lvalue */
  if (
    iconv( conv_descriptor, in, &size_in, out, size_out )
    == (size_t)(-1)
  ) error(
    EPERM, 0,
    "utf-8 conversion failed"
  );
}

/* Transliterate Latin-1 to utf-8.  */
char *utf8(
  char              *out     , /* optional caller-provided output     *
                                *   buffer (0 to disable)             */
  size_t             size_out, /* output buffer size including the    *
                                *   terminating null (may give 0 if   *
                                *   the caller provides no buffer)    */
  int                n       , /* how many strings follow?            */
  ...                          /* the strings to be transliterate     */
) {

  char   *out_start;
  int     isownbuf = 0; /* is a local output buffer used (rather than
                         * one provided by the caller)? */
  va_list argp;

  if ( size_out < 0 || n < 0 ) error(
    EPERM, 0,
    "internal error: utf8() does not accept negative `size_out' or `n'"
  );

  /* Observe that the function requires certain resources, for speed's
   * sake reserved only once, the first time the function is called.
   * Once reserved, the resources are kept open here for use in later
   * calls.  */
  if ( conv_descriptor == (iconv_t)(-1) ) {
    conv_descriptor = iconv_open( CHARFMT_UTF8, CHARFMT_LATIN1 );
    if ( conv_descriptor == (iconv_t)(-1) ) error(
      EPERM, 0,
      "could not initialize the utf-8 conversion"
    );
    out0   = malloc2( SIZE_UTF8_RING*UTF8_SIZE_BUF );
    i_out0 = 0;
  }

  /* Initialize the output buffer.  */
  if ( !out ) {
    isownbuf = 1;
    out = out0[i_out0];
    if ( !size_out || size_out > UTF8_SIZE_BUF )
      size_out = UTF8_SIZE_BUF;
  }
  else if ( !size_out ) error(
    EPERM, 0,
    "internal error: utf8() needs at least one byte of output buffer"
  );
  out_start = out;

  /* Observe the actual iconv() transliteration here.  Variadic C
   * syntax, seen here, is admittedly a bit exotic: the various "va_*"
   * macros are provided by <stdarg.h>; the va_arg() macro returns
   * each "..." parameter in turn.  */
  va_start( argp, n );
    /* The variadic "va_start()" syntax here is clear to the C compiler,
     * but is unusual and even weird enough that it merits a special
     * comment for the benefit of the human reader.  The reader
     * unfamiliar with variadic C syntax is likely mistakenly to
     * interpret the "n" here as an argument count.  The parameter `n'
     * is indeed an argument count, but va_start() does not care about
     * that.  In fact, the va_start() macro does not care at all what
     * value `n' contains or even what `n' represents.  To va_start(),
     * the symbol "n" merely identifies the last parameter before
     * the "..." in the function prototype---whatever this parameter
     * happens to be.  In fact, C variadics do not count parameters at
     * all; they leave them to the programmer to count.  */
  {
    char *in;
    /* Notice the "n" here.  This "n" is interpreted in the normal way:
     * as an argument count.  */
    while ( n-- ) {
      in = va_arg( argp, char * );
      while ( *in ) convert_one_character( &in, &out, &size_out );
    }
    in = "";
    convert_one_character( &in, &out, &size_out );
  }
  va_end( argp );

  i_out0 = ( i_out0 + isownbuf ) % SIZE_UTF8_RING;
  return out_start;

}

/* Default properly with respect to the user's locale.
 *
 * Remark:  At call-time, argp.c's parse_cl() will have already ensured
 * that no more than one of
 *
 *   -L opt.no_latin1 (probably should have been named "opt.ascii")
 *   -l opt.latin1
 *   -u opt.utf8
 *
 * is selected.
 */
void locale() {

  /* Purpose: read the user's LC_CTYPE locale and codeset, then, if the
   * user has selected none of opt.no_latin1, opt.latin1 and opt.utf8,
   * select one of them to match.  */

  int   is_c_locale, is_ascii, is_latin1, is_utf8;
  char *saved_locale, *user_ctype, *user_codeset;

  /* Step 1.  Switch LC_CTYPE to the user's locale just long enough to
   * learn its name and its codeset, then restore the locale the
   * program was running under.  (The program formats output in colored
   * columns of text and does not actually want to run under the user's
   * locale; it only wants to know what the user's locale and character
   * encoding are.)  Every string is copied with strdup2() before the
   * next setlocale() call is made.  */
  saved_locale = strdup2( setlocale( LC_ALL, NULL ) );

  user_ctype = setlocale( LC_CTYPE, "" );
  if ( !user_ctype ) user_ctype = LOCALE_C; /* no usable user locale */
  user_ctype = strdup2( user_ctype );
  setlocale( LC_CTYPE, user_ctype );

  user_codeset = strdup2( nl_langinfo( CODESET ) );

  setlocale( LC_ALL, saved_locale );
  free( saved_locale );

  #if DEBUG_LOCALE
    fprintf(
      stderr,
      "diagnostic: LC_CTYPE == %s; CODESET == %s\n",
      user_ctype, user_codeset
    );
  #endif

  /* Step 2.  Classify what was found, then release the copies.  */
  is_c_locale =
       strcmp( user_ctype, LOCALE_C  ) == 0
    || strcmp( user_ctype, LOCALE_C2 ) == 0;
  is_ascii    = strcmp( user_codeset, CODESET_ASCII  ) == 0;
  is_latin1   = strcmp( user_codeset, CODESET_LATIN1 ) == 0;
  is_utf8     = strcmp( user_codeset, CODESET_UTF8   ) == 0;
  free( user_ctype   );
  free( user_codeset );

  /* Step 3.  Default the character-encoding options.  An encoding
   * option already selected takes precedence over the locale.  */
  if ( opt.no_latin1 || opt.latin1 || opt.utf8 ) return;

  if ( is_utf8 ) {
    opt.utf8 = 1;
    return;
  }

  if ( is_latin1 || ( is_c_locale && is_ascii ) ) {
    opt.latin1 = 1;
    return;
  }

  /* Any other locale or codeset: plain ascii with ascii dots.  */
  opt.no_latin1  = 1;
  opt.ascii_dots = 1;

}

