src/unicode.cpp

/**
 * @file unicode.cpp
 * Detects, reads and writes characters in the proper format.
 *
 * @author  Ben Gardner
 * @license GPL v2+
 */
#include "uncrustify_types.h"
#include "prototypes.h"
#include "unc_ctype.h"
#include <cstring>
#include <cstdlib>


/**
 * See if all characters are ASCII (0-127)
 */
static bool is_ascii(const vector<UINT8>& data, int& non_ascii_cnt, int& zero_cnt)
{
   non_ascii_cnt = zero_cnt = 0;
   for (int idx = 0; idx < (int)data.size(); idx++)
   {
      if (data[idx] & 0x80)
      {
         non_ascii_cnt++;
      }
      if (!data[idx])
      {
         zero_cnt++;
      }
   }
   return((non_ascii_cnt + zero_cnt) == 0);
}


/**
 * Convert the array of bytes into an array of ints
 */
static bool decode_bytes(const vector<UINT8>& in_data, deque<int>& out_data)
{
   out_data.resize(in_data.size());
   for (int idx = 0; idx < (int)in_data.size(); idx++)
   {
      out_data[idx] = in_data[idx];
   }
   return true;
}


/**
 * Appends the UTF-8 byte sequence for a character to a byte vector.
 * Sequences of 1 to 6 bytes are produced, depending on the value.
 * Negative values are illegal and nothing is appended for them.
 *
 * @param ch  the character value to encode
 * @param res the vector that the encoded bytes are appended to
 */
void encode_utf8(int ch, vector<UINT8>& res)
{
   if (ch < 0)
   {
      /* illegal code - do not store */
      return;
   }
   if (ch < 0x80)
   {
      /* 0xxxxxxx - stored as-is */
      res.push_back(ch);
      return;
   }

   int lead_marker;  /* the fixed high bits of the first byte */
   int trail_cnt;    /* how many 10xxxxxx bytes follow the first byte */

   if (ch < 0x0800)
   {
      /* 110xxxxx + 1 trailing byte */
      lead_marker = 0xC0;
      trail_cnt   = 1;
   }
   else if (ch < 0x10000)
   {
      /* 1110xxxx + 2 trailing bytes */
      lead_marker = 0xE0;
      trail_cnt   = 2;
   }
   else if (ch < 0x200000)
   {
      /* 11110xxx + 3 trailing bytes */
      lead_marker = 0xF0;
      trail_cnt   = 3;
   }
   else if (ch < 0x4000000)
   {
      /* 111110xx + 4 trailing bytes */
      lead_marker = 0xF8;
      trail_cnt   = 4;
   }
   else /* (ch <= 0x7fffffff) */
   {
      /* 1111110x + 5 trailing bytes */
      lead_marker = 0xFC;
      trail_cnt   = 5;
   }

   /* the first byte carries the bits above all of the 6-bit groups */
   res.push_back(lead_marker | (ch >> (6 * trail_cnt)));

   /* then one 10xxxxxx byte per 6-bit group, most significant group first */
   for (int shift = 6 * (trail_cnt - 1); shift >= 0; shift -= 6)
   {
      res.push_back(0x80 | ((ch >> shift) & 0x3f));
   }
}


/**
 * Decodes the UTF-8 sequences found in in_data and appends the resulting
 * character values to out_data, which is cleared first.
 * A leading UTF-8 BOM (EF BB BF) is skipped and not stored.
 * Lead bytes for sequences of up to 6 bytes are accepted.
 *
 * @param in_data  the raw bytes
 * @param out_data receives the decoded characters
 * @return         false if a lead byte is invalid, a continuation byte is
 *                 malformed, or the data ends in the middle of a sequence;
 *                 true otherwise
 */
static bool decode_utf8(const vector<UINT8>& in_data, deque<int>& out_data)
{
   const int len = (int)in_data.size();
   int       pos = 0;

   out_data.clear();

   /* skip over a UTF-8 BOM, if there is one */
   if ((len >= 3) &&
       (in_data[0] == 0xef) &&
       (in_data[1] == 0xbb) &&
       (in_data[2] == 0xbf))
   {
      pos = 3;
   }

   while (pos < len)
   {
      int code = in_data[pos++];
      int trail_cnt;

      if (code < 0x80)
      {
         /* 1-byte sequence: nothing else to read */
         out_data.push_back(code);
         continue;
      }

      /* the lead byte says how many continuation bytes are expected */
      if ((code & 0xE0) == 0xC0)
      {
         code     &= 0x1F;
         trail_cnt = 1;
      }
      else if ((code & 0xF0) == 0xE0)
      {
         code     &= 0x0F;
         trail_cnt = 2;
      }
      else if ((code & 0xF8) == 0xF0)
      {
         code     &= 0x07;
         trail_cnt = 3;
      }
      else if ((code & 0xFC) == 0xF8)
      {
         code     &= 0x03;
         trail_cnt = 4;
      }
      else if ((code & 0xFE) == 0xFC)
      {
         code     &= 0x01;
         trail_cnt = 5;
      }
      else
      {
         /* invalid UTF-8 lead byte */
         return false;
      }

      for (int k = 0; k < trail_cnt; k++)
      {
         if (pos >= len)
         {
            /* short UTF-8 sequence */
            return false;
         }

         const int next = in_data[pos++];
         if ((next & 0xC0) != 0x80)
         {
            /* not a 10xxxxxx continuation byte */
            return false;
         }
         code = (code << 6) | (next & 0x3f);
      }
      out_data.push_back(code);
   }
   return true;
}


/**
 * Extract 2 bytes from the stream and increment idx by 2
 */
static int get_word(const vector<UINT8>& in_data, int& idx, bool be)
{
   int ch;

   if ((idx + 2) > (int)in_data.size())
   {
      ch = -1;
   }
   else if (be)
   {
      ch = (in_data[idx] << 8) | in_data[idx + 1];
   }
   else
   {
      ch = in_data[idx] | (in_data[idx + 1] << 8);
   }
   idx += 2;
   return ch;
}


/**
 * Decodes a UTF-16 byte stream into out_data, which is cleared first.
 *
 * The byte order is taken from the BOM when the first two bytes are one
 * (FE FF = big endian, FF FE = little endian); the BOM is not stored.
 * Without a BOM, and with at least 6 bytes available, the byte order is
 * guessed by assuming that the first three words are ASCII, so that one
 * byte of each of those words is 0.
 *
 * @param in_data  the raw bytes
 * @param out_data receives the decoded characters
 * @param enc      set to ENC_UTF16_BE or ENC_UTF16_LE when the byte order
 *                 was determined; set to ENC_ASCII when the guess fails;
 *                 untouched when the length check fails
 * @return         false if the length is odd or less than 2, the byte order
 *                 could not be determined, or a surrogate is not part of a
 *                 valid high/low pair; true otherwise
 */
static bool decode_utf16(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc)
{
   out_data.clear();

   /* need an even number of bytes and at least one word (BOM or char) */
   if (((in_data.size() & 1) != 0) || (in_data.size() < 2))
   {
      return false;
   }

   int pos = 2;   /* assume a BOM that has to be skipped */

   if ((in_data[0] == 0xfe) && (in_data[1] == 0xff))
   {
      enc = ENC_UTF16_BE;
   }
   else if ((in_data[0] == 0xff) && (in_data[1] == 0xfe))
   {
      enc = ENC_UTF16_LE;
   }
   else
   {
      /* No BOM, so decoding starts at the first byte. With a few words
       * available, take a guess, assuming the first few chars are ASCII. */
      const bool can_guess = (in_data.size() >= 6);

      pos = 0;
      if (can_guess && (in_data[0] == 0) && (in_data[2] == 0) && (in_data[4] == 0))
      {
         enc = ENC_UTF16_BE;
      }
      else if (can_guess && (in_data[1] == 0) && (in_data[3] == 0) && (in_data[5] == 0))
      {
         enc = ENC_UTF16_LE;
      }
      else
      {
         enc = ENC_ASCII;
         return false;
      }
   }

   const bool big_endian = (enc == ENC_UTF16_BE);
   const int  total      = (int)in_data.size();

   while (pos < total)
   {
      const int unit = get_word(in_data, pos, big_endian);

      if ((unit & 0xfc00) == 0xd800)
      {
         /* high surrogate: a low surrogate has to follow */
         const int low = get_word(in_data, pos, big_endian);
         if ((low & 0xfc00) != 0xdc00)
         {
            return false;
         }
         out_data.push_back((((unit & 0x3ff) << 10) | (low & 0x3ff)) + 0x10000);
         continue;
      }

      if ((unit < 0) || ((unit >= 0xD800) && (unit < 0xE000)))
      {
         /* failed read or a low surrogate without a high surrogate */
         return false;
      }
      out_data.push_back(unit);
   }
   return true;
}


/**
 * Looks for the BOM of UTF-16 BE/LE and UTF-8.
 * If found, set enc and return true.
 * Sets enc to ENC_ASCII and returns false if not found.
 */
static bool decode_bom(const vector<UINT8>& in_data, CharEncoding& enc)
{
   enc = ENC_ASCII;
   if (in_data.size() >= 2)
   {
      if ((in_data[0] == 0xfe) && (in_data[1] == 0xff))
      {
         enc = ENC_UTF16_BE;
         return true;
      }
      else if ((in_data[0] == 0xff) && (in_data[1] == 0xfe))
      {
         enc = ENC_UTF16_LE;
         return true;
      }
      else if ((in_data.size() >= 3) &&
               (in_data[0] == 0xef) &&
               (in_data[1] == 0xbb) &&
               (in_data[2] == 0xbf))
      {
         enc = ENC_UTF8;
         return true;
      }
   }
   return false;
}


/**
 * Determines the encoding of the data and converts it to a sequence of ints.
 *
 * The checks are done in this order:
 *  1. a BOM selects UTF-8 or UTF-16 and the result of that decoder is returned
 *  2. data without any 0 byte or byte >= 0x80 is ASCII
 *  3. if more than 1/4 and at most 1/2 of the bytes are 0, UTF-16 is tried
 *  4. UTF-8 is tried
 *  5. anything else is treated as a plain byte sequence (ENC_BYTE)
 *
 * @param in_data  the raw bytes
 * @param out_data receives the decoded characters
 * @param enc      receives the detected encoding
 * @param has_bom  set to true if the data started with a BOM, false if not
 * @return         the result of the decoder that was finally used
 */
bool decode_unicode(const vector<UINT8>& in_data, deque<int>& out_data, CharEncoding& enc, bool& has_bom)
{
   /* a BOM decides the encoding, no guessing is needed */
   has_bom = decode_bom(in_data, enc);
   if (has_bom)
   {
      return((enc == ENC_UTF8) ? decode_utf8(in_data, out_data)
                               : decode_utf16(in_data, out_data, enc));
   }

   /* simple ASCII has no high-bit bytes and no zero bytes */
   int high_cnt;
   int nul_cnt;
   if (is_ascii(in_data, high_cnt, nul_cnt))
   {
      enc = ENC_ASCII;
      return decode_bytes(in_data, out_data);
   }

   /* There are a lot of 0's in UTF-16 (~50%) */
   const int  total        = (int)in_data.size();
   const bool utf16_likely = (nul_cnt > (total / 4)) && (nul_cnt <= (total / 2));

   if (utf16_likely && decode_utf16(in_data, out_data, enc))
   {
      return true;
   }

   if (decode_utf8(in_data, out_data))
   {
      enc = ENC_UTF8;
      return true;
   }

   /* it is an unrecognized byte sequence */
   enc = ENC_BYTE;
   return decode_bytes(in_data, out_data);
}


/**
 * Writes one byte; used directly for the ASCII and BYTE encodings.
 * The byte goes to cpd.fout (via fputc) and is appended to cpd.bout, for
 * whichever of the two is set. Values outside of 0-255 are silently dropped.
 *
 * @param ch the byte value to write
 */
static void write_byte(int ch)
{
   if ((ch & 0xff) != ch)
   {
      /* illegal code - do not store */
      return;
   }

   if (cpd.fout)
   {
      fputc(ch, cpd.fout);
   }
   if (cpd.bout)
   {
      cpd.bout->push_back((UINT8)ch);
   }
}


/**
 * Writes a single character to a file using UTF-8 encoding
 */
static void write_utf8(int ch)
{
   vector<UINT8> vv;
   vv.reserve(6);

   encode_utf8(ch, vv);
   for (int idx = 0; idx < (int)vv.size(); idx++)
   {
      write_byte(vv[idx]);
   }
}


/**
 * Writes one 16-bit code unit as two bytes.
 *
 * @param unit the 16-bit value to write
 * @param be   true: high byte first; false: low byte first
 */
static void write_utf16_unit(int unit, bool be)
{
   const int high = unit >> 8;
   const int low  = unit & 0xff;

   write_byte(be ? high : low);
   write_byte(be ? low : high);
}


/**
 * Writes a single character using UTF-16 encoding.
 * Values from 0x10000 to 0x10FFFF are written as a surrogate pair.
 * Negative values, values in the surrogate range (0xD800-0xDFFF) and
 * values of 0x110000 and above are illegal and nothing is written for them.
 *
 * @param ch the character value to write
 * @param be true for big endian, false for little endian byte order
 */
static void write_utf16(int ch, bool be)
{
   if ((ch < 0) || (ch >= 0x110000))
   {
      /* illegal code - do not store */
      return;
   }

   if (ch >= 0x10000)
   {
      /* split the 20 bits above 0x10000 into a high and a low surrogate */
      const int offset = ch - 0x10000;

      write_utf16_unit(0xD800 + (offset >> 10), be);
      write_utf16_unit(0xDC00 + (offset & 0x3ff), be);
   }
   else if ((ch < 0xD800) || (ch >= 0xE000))
   {
      /* U+0000 to U+D7FF and U+E000 to U+FFFF fit in a single unit */
      write_utf16_unit(ch, be);
   }
   else
   {
      /* a lone surrogate is an illegal code - do not store */
   }
}


/**
 * Writes the byte order mark that matches cpd.enc.
 * UTF-8 gets the bytes EF BB BF and the UTF-16 encodings get 0xFEFF in the
 * matching byte order. Nothing is written for any other encoding.
 */
void write_bom()
{
   static const int utf8_bom[] = { 0xef, 0xbb, 0xbf };

   switch (cpd.enc)
   {
   case ENC_UTF16_BE:
   case ENC_UTF16_LE:
      write_utf16(0xfeff, cpd.enc == ENC_UTF16_BE);
      break;

   case ENC_UTF8:
      for (int k = 0; k < 3; k++)
      {
         write_byte(utf8_bom[k]);
      }
      break;

   default:
      /* the other encodings do not have a BOM */
      break;
   }
}


/**
 * Writes a single character using the encoding selected by cpd.enc.
 * Negative values are ignored. With ENC_BYTE only the low 8 bits are
 * written. ENC_ASCII and any encoding not listed here pass the value to
 * write_byte() unmodified.
 *
 * @param ch the 31-bit char value
 */
void write_char(int ch)
{
   if (ch < 0)
   {
      return;
   }

   switch (cpd.enc)
   {
   case ENC_UTF8:
      write_utf8(ch);
      break;

   case ENC_UTF16_BE:
   case ENC_UTF16_LE:
      write_utf16(ch, cpd.enc == ENC_UTF16_BE);
      break;

   case ENC_BYTE:
      write_byte(ch & 0xff);
      break;

   case ENC_ASCII:
   default:
      write_byte(ch);
      break;
   }
}


/**
 * Writes every character of the text, in order, using write_char().
 *
 * @param text the text to write
 */
void write_string(const unc_text& text)
{
   const int len = (int)text.size();
   int       pos = 0;

   while (pos < len)
   {
      write_char(text[pos]);
      pos++;
   }
}