/*
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 1999-2003 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Xerces" and "Apache Software Foundation" must
 *    not be used to endorse or promote products derived from this
 *    software without prior written permission. For written
 *    permission, please contact apache\@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    nor may "Apache" appear in their name, without prior written
 *    permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation, and was
 * originally based on software copyright (c) 1999, International
 * Business Machines, Inc., http://www.ibm.com .  For more information
 * on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

/**
 *  $Id: XMLRecognizer.cpp,v 1.8 2003/05/15 18:26:07 knoaman Exp $
 */


// ---------------------------------------------------------------------------
//  Includes
// ---------------------------------------------------------------------------
#include <xercesc/framework/XMLRecognizer.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/XMLString.hpp>

XERCES_CPP_NAMESPACE_BEGIN

// ---------------------------------------------------------------------------
//  Local data
//
//  gEncodingNameMap
//      This array maps the Encodings enum values to their canonical names.
//      Be sure to keep this in sync with that enum!
// ---------------------------------------------------------------------------
static const XMLCh* gEncodingNameMap[XMLRecognizer::Encodings_Count] =
{
    XMLUni::fgEBCDICEncodingString
    , XMLUni::fgUCS4BEncodingString
    , XMLUni::fgUCS4LEncodingString
    , XMLUni::fgUSASCIIEncodingString
    , XMLUni::fgUTF8EncodingString
    , XMLUni::fgUTF16BEncodingString
    , XMLUni::fgUTF16LEncodingString
    , XMLUni::fgXMLChEncodingString
};



// ---------------------------------------------------------------------------
//  XMLRecognizer: Public, const static data
//
//  gXXXPre
//  gXXXPreLen
//      The byte sequence prefixes for all of the encodings that we can
//      auto sense. Also included is the length of each sequence.
// ---------------------------------------------------------------------------
const char           XMLRecognizer::fgASCIIPre[]  = { 0x3C, 0x3F, 0x78, 0x6D, 0x6C, 0x20 };
const unsigned int   XMLRecognizer::fgASCIIPreLen = 6;
const XMLByte        XMLRecognizer::fgEBCDICPre[] = { 0x4C, 0x6F, 0xA7, 0x94, 0x93, 0x40 };
const unsigned int   XMLRecognizer::fgEBCDICPreLen = 6;
const XMLByte        XMLRecognizer::fgUTF16BPre[] = { 0x00, 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20 };
const XMLByte        XMLRecognizer::fgUTF16LPre[] = { 0x3C, 0x00, 0x3F, 0x00, 0x78, 0x00, 0x6D, 0x00, 0x6C, 0x00, 0x20, 0x00 };
const unsigned int   XMLRecognizer::fgUTF16PreLen = 12;
const XMLByte        XMLRecognizer::fgUCS4BPre[]  =
{
        0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F
    ,   0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D
    ,   0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x20
};
const XMLByte        XMLRecognizer::fgUCS4LPre[]  =
{
        0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00
    ,   0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00
    ,   0x6C, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00
};
const unsigned int   XMLRecognizer::fgUCS4PreLen = 24;

const char           XMLRecognizer::fgUTF8BOM[] = {(char)0xEF, (char)0xBB, (char)0xBF};
const unsigned int   XMLRecognizer::fgUTF8BOMLen = 3;

// ---------------------------------------------------------------------------
//  XMLRecognizer: Encoding recognition methods
// ---------------------------------------------------------------------------
XMLRecognizer::Encodings
XMLRecognizer::basicEncodingProbe(  const   XMLByte* const  rawBuffer
                                    , const unsigned int    rawByteCount)
{
    //
    //  As an optimization to check the 90% case, check first for the ASCII
    //  sequence '<?xml', which means its either US-ASCII, UTF-8, or some
    //  other encoding that we don't do manually but which happens to share
    //  the US-ASCII code points for these characters. So just return UTF-8
    //  to get us through the first line.
    //
    if (rawByteCount >= fgASCIIPreLen)
    {
        if (!memcmp(rawBuffer, fgASCIIPre, fgASCIIPreLen))
            return UTF_8;
    }

    //
    //  If the count of raw bytes is less than 2, it cannot be anything
    //  we understand, so return UTF-8 as a fallback.
    //
    if (rawByteCount < 2)
        return UTF_8;
         
    //  
    //  We have two to four bytes, so lets check for a UTF-16 BOM. That
    //  is quick to check and enough to identify two major encodings.   
    // 

    if (rawByteCount < 4)
    {
        if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
            return UTF_16B;
        else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
            return UTF_16L;
        else 
            return UTF_8;
    }

    /***
     *    F.1 Detection Without External Encoding Information
     *
     *    Because each XML entity not accompanied by external encoding information and 
     *    not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, 
     *    in which the first characters must be '<?xml', any conforming processor can detect, 
     *    after two to four octets of input, which of the following cases apply. 
     *
     *    In reading this list, it may help to know that in UCS-4, '<' is "#x0000003C" and 
     *    '?' is "#x0000003F", and the Byte Order Mark required of UTF-16 data streams is 
     *    "#xFEFF". The notation ## is used to denote any byte value except that two consecutive 
     *    ##s cannot be both 00.
     *
     *    With a Byte Order Mark:
     *
     *    00 00 FE FF           UCS-4,    big-endian machine    (1234 order) 
     *    FF FE 00 00           UCS-4,    little-endian machine (4321 order) 
     *    00 00 FF FE           UCS-4,    unusual octet order   (2143) 
     *    FE FF 00 00           UCS-4,    unusual octet order   (3412) 
     *    FE FF ## ##           UTF-16,   big-endian 
     *    FF FE ## ##           UTF-16,   little-endian 
     *    EF BB BF              UTF-8 
     *
     ***/

    //
    //  We have at least four bytes, so we can check all BOM
    //  for UCS-4BE, UCS-4LE, UTF-16BE and UTF-16LE as well.
    //
    if ((rawBuffer[0] == 0x00) && (rawBuffer[1] == 0x00) && (rawBuffer[2] == 0xFE) && (rawBuffer[3] == 0xFF))
        return UCS_4B;
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE) && (rawBuffer[2] == 0x00) && (rawBuffer[3] == 0x00))
        return UCS_4L;
    else if ((rawBuffer[0] == 0xFE) && (rawBuffer[1] == 0xFF))
        return UTF_16B;
    else if ((rawBuffer[0] == 0xFF) && (rawBuffer[1] == 0xFE))
        return UTF_16L;

    //
    //  We have at least 4 bytes. So lets check the 4 byte sequences that
    //  indicate other UTF-16 and UCS encodings.
    //
    if ((rawBuffer[0] == 0x00) || (rawBuffer[0] == 0x3C))
    {
        if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4BPre, fgUCS4PreLen))
            return UCS_4B;
        else if (rawByteCount >= fgUCS4PreLen && !memcmp(rawBuffer, fgUCS4LPre, fgUCS4PreLen))
            return UCS_4L;
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16BPre, fgUTF16PreLen))
            return UTF_16B;
        else if (rawByteCount >= fgUTF16PreLen && !memcmp(rawBuffer, fgUTF16LPre, fgUTF16PreLen))
            return UTF_16L;
    }

    //
    //  See if we have enough bytes to possibly match the EBCDIC prefix.
    //  If so, try it.
    //
    if (rawByteCount > fgEBCDICPreLen)
    {
        if (!memcmp(rawBuffer, fgEBCDICPre, fgEBCDICPreLen))
            return EBCDIC;
    }

    //
    //  Does not seem to be anything we know, so go with UTF-8 to get at
    //  least through the first line and see what it really is.
    //
    return UTF_8;
}


XMLRecognizer::Encodings
XMLRecognizer::encodingForName(const XMLCh* const encName)
{
    //
    //  Compare the passed string, assume input string is already uppercased,
    //  to the variations that we recognize.
    //
    //  !!NOTE: Note that we don't handle EBCDIC here because we don't handle
    //  that one ourselves. It is allowed to fall into 'other'.
    //
   if (encName == XMLUni::fgXMLChEncodingString ||
        !XMLString::compareString(encName, XMLUni::fgXMLChEncodingString))
   {
        return XMLRecognizer::XERCES_XMLCH;
   }
   else if (!XMLString::compareString(encName, XMLUni::fgUTF8EncodingString)
    ||  !XMLString::compareString(encName, XMLUni::fgUTF8EncodingString2))
    {
        return XMLRecognizer::UTF_8;
    }
     else if (!XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString)
          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString2)
          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString3)
          ||  !XMLString::compareString(encName, XMLUni::fgUSASCIIEncodingString4))
    {
        return XMLRecognizer::US_ASCII;
    }
     else if (!XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString)
          ||  !XMLString::compareString(encName, XMLUni::fgUTF16LEncodingString2))
    {
        return XMLRecognizer::UTF_16L;
    }
     else if (!XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString)
          ||  !XMLString::compareString(encName, XMLUni::fgUTF16BEncodingString2))
    {
        return XMLRecognizer::UTF_16B;
    }
     else if (!XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString)
          ||  !XMLString::compareString(encName, XMLUni::fgUCS4LEncodingString2))
    {
        return XMLRecognizer::UCS_4L;
    }
     else if (!XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString)
          ||  !XMLString::compareString(encName, XMLUni::fgUCS4BEncodingString2))
    {
        return XMLRecognizer::UCS_4B;
    }

    // Return 'other' since we don't recognizer it
    return XMLRecognizer::OtherEncoding;
}


const XMLCh*
XMLRecognizer::nameForEncoding(const XMLRecognizer::Encodings theEncoding)
{
    if (theEncoding > Encodings_Count)
        ThrowXML(RuntimeException, XMLExcepts::XMLRec_UnknownEncoding);

    return gEncodingNameMap[theEncoding];
}

XERCES_CPP_NAMESPACE_END
