os/textandloc/charconvfw/charconv_fw/tools/convtool/convtool.cpp
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/textandloc/charconvfw/charconv_fw/tools/convtool/convtool.cpp	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,376 @@
     1.4 +/*
     1.5 +* Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     1.6 +* All rights reserved.
     1.7 +* This component and the accompanying materials are made available
     1.8 +* under the terms of "Eclipse Public License v1.0"
     1.9 +* which accompanies this distribution, and is available
    1.10 +* at the URL "http://www.eclipse.org/legal/epl-v10.html".
    1.11 +*
    1.12 +* Initial Contributors:
    1.13 +* Nokia Corporation - initial contribution.
    1.14 +*
    1.15 +* Contributors:
    1.16 +*
    1.17 +* Description: 
    1.18 +*
    1.19 +*/
    1.20 +
    1.21 +
    1.22 +#pragma warning (disable: 4514) // unreferenced inline/local function has been removed
    1.23 +
    1.24 +#include <stdio.h>
    1.25 +#include <stdlib.h>
    1.26 +#include <string.h>
    1.27 +#include <locale.h>
    1.28 +#include <wchar.h>
    1.29 +#if defined(__VC32__)
    1.30 +#include <FCNTL.H>
    1.31 +#include <IO.H>
    1.32 +#endif
    1.33 +
    1.34 +#undef BIG_ENDIAN
    1.35 +
    1.36 +#if (defined(__MSVCRT__) || defined(_MSC_VER))
    1.37 +//#define _stricmp  _stricmp
    1.38 +//#define _strnicmp _strnicmp
    1.39 +#else // linux 
    1.40 +#define _stricmp  strcasecmp
    1.41 +#define _strnicmp strncasecmp
    1.42 +#endif
    1.43 +
    1.44 +const int KVersionNumber=025;
    1.45 +const int KLargeNumber=1000000;
    1.46 +
    1.47 +extern int Utf8ToUnicode(wchar_t* aUnicode, const char* aUtf8);
    1.48 +extern int UnicodeToUtf8(char* aUtf8, const wchar_t* aUnicode);
    1.49 +
    1.50 +enum TByteOrder
    1.51 +	{
    1.52 +	EByteOrderUnspecified,
    1.53 +	EByteOrderBigEndian,
    1.54 +	EByteOrderLittleEndian,
    1.55 +#if defined(BIG_ENDIAN)
    1.56 +	EByteOrderNative=EByteOrderBigEndian,
    1.57 +	EByteOrderForeign=EByteOrderLittleEndian
    1.58 +#else
    1.59 +	EByteOrderNative=EByteOrderLittleEndian,
    1.60 +	EByteOrderForeign=EByteOrderBigEndian
    1.61 +#endif
    1.62 +	};
    1.63 +
    1.64 +struct SBuffer
    1.65 +	{
    1.66 +	int iNumberOfBytes;
    1.67 +	void* iData;
    1.68 +	};
    1.69 +
    1.70 +void PrintUsage(const char* aProgramName)
    1.71 +    {
    1.72 +	fprintf(stderr, "\nVersion %03d\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n", KVersionNumber);
    1.73 +	fprintf(stderr, "Usage:\n\n\t%s [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t"
    1.74 +				"options    :=  [-big|-little][-byteordermark]\n\t"
    1.75 +				"inputspec  :=  -input=<format> [<input_file>]\n\t"
    1.76 +				"outputspec :=  -output=<format> [<output_file>]\n\t"
    1.77 +				"format     :=  unicode|1252|utf8|...\n\n", aProgramName);
    1.78 +	const char* localeData=setlocale(LC_ALL, "");
    1.79 +	while (*localeData!='.')
    1.80 +		{
    1.81 +		++localeData;
    1.82 +		}
    1.83 +	fprintf(stderr, "(The default encoding is currently \"%s\")\n\n", localeData+1);
    1.84 +	}
    1.85 +
    1.86 +void Assert(int aCondition, const char* aErrorMessageFormat, const void* aExtraParameter1=NULL, const void* aExtraParameter2=NULL)
    1.87 +	{
    1.88 +	if (!aCondition)
    1.89 +		{
    1.90 +		char errorMessage[100];
    1.91 +		sprintf(errorMessage, aErrorMessageFormat, aExtraParameter1, aExtraParameter2);
    1.92 +		fprintf(stderr, "Error: %s\n", errorMessage);
    1.93 +		exit(1);
    1.94 +		}
    1.95 +	}
    1.96 +
    1.97 +void PrintWarning(const char* aWarningMessage)
    1.98 +	{
    1.99 +	fprintf(stderr, "Warning: %s\n", aWarningMessage);
   1.100 +	}
   1.101 +
   1.102 +int TryFileParameter(int aArgc, char* aArgv[], int& aArgIndex, const char* aInputOrOutput, const char*& aEncoding, FILE*& aFile, const char* aFileMode)
   1.103 +	{
   1.104 +	char prefix[100];
   1.105 +	strcpy(prefix, "-");
   1.106 +	strcat(prefix, aInputOrOutput);
   1.107 +	strcat(prefix, "=");
   1.108 +	int lengthOfPrefix=strlen(prefix);
   1.109 +	if (_strnicmp(aArgv[aArgIndex], prefix, lengthOfPrefix)==0)
   1.110 +		{
   1.111 +		Assert(aEncoding==NULL, "\"%s...\" is specified more than once", prefix);
   1.112 +		aEncoding=aArgv[aArgIndex]+lengthOfPrefix;
   1.113 +		++aArgIndex;
   1.114 +		if ((aArgIndex>=aArgc) || (aArgv[aArgIndex][0]=='-'))
   1.115 +			{
   1.116 +			--aArgIndex;
   1.117 +			}
   1.118 +		else
   1.119 +			{
   1.120 +			aFile=fopen(aArgv[aArgIndex], aFileMode);
   1.121 +			Assert(aFile!=NULL, "opening %s-file failed", aInputOrOutput);
   1.122 +			}
   1.123 +		return 1;
   1.124 +		}
   1.125 +	return 0;
   1.126 +	}
   1.127 +
   1.128 +void ReadParameters(int aArgc, char* aArgv[], int& aOutputByteOrderMark, TByteOrder& aUnicodeByteOrder, const char*& aInputEncoding, const char*& aOutputEncoding, FILE*& aInputFile, FILE*& aOutputFile)
   1.129 +	{
   1.130 +	if ((aArgc<=1) || (_stricmp(aArgv[1], "?")==0) || (_stricmp(aArgv[1], "/?")==0))
   1.131 +		{
   1.132 +		PrintUsage(aArgv[0]);
   1.133 +		exit(0);
   1.134 +		}
   1.135 +	for (int i=1; i<aArgc; ++i) // start at index 1 to avoid the program name (which is the first parameter)
   1.136 +		{
   1.137 +		if (_stricmp(aArgv[i], "-byteordermark")==0)
   1.138 +			{
   1.139 +			Assert(!aOutputByteOrderMark, "\"-byteordermark\" is specified more than once");
   1.140 +			aOutputByteOrderMark=1;
   1.141 +			}
   1.142 +		else if (_stricmp(aArgv[i], "-big")==0)
   1.143 +			{
   1.144 +			Assert(aUnicodeByteOrder==EByteOrderUnspecified, "the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   1.145 +			aUnicodeByteOrder=EByteOrderBigEndian;
   1.146 +			}
   1.147 +		else if (_stricmp(aArgv[i], "-little")==0)
   1.148 +			{
   1.149 +			Assert(aUnicodeByteOrder==EByteOrderUnspecified, "the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   1.150 +			aUnicodeByteOrder=EByteOrderLittleEndian;
   1.151 +			}
   1.152 +		else
   1.153 +			{
   1.154 +			Assert(TryFileParameter(aArgc, aArgv, i, "input", aInputEncoding, aInputFile, "r") ||
   1.155 +				   TryFileParameter(aArgc, aArgv, i, "output", aOutputEncoding, aOutputFile, "w"), "bad parameter \"%s\"", aArgv[i]);
   1.156 +			}
   1.157 +		}
   1.158 +	Assert(aInputEncoding!=NULL, "no input encoding is specified");
   1.159 +	Assert(aOutputEncoding!=NULL, "no output encoding is specified");
   1.160 +	}
   1.161 +
   1.162 +int ReadFromFileReturningNumberOfBytesRead(void* aBuffer, int aNumberOfBytesToRead, FILE* aInputFile)
   1.163 +	{
   1.164 +	int numberOfBytesRead=0;
   1.165 +	int numberOfBytesToReadThisTime=aNumberOfBytesToRead;
   1.166 +	for (;;)
   1.167 +		{
   1.168 +		for (;;)
   1.169 +			{
   1.170 +			const int remainingNumberOfBytesToRead=aNumberOfBytesToRead-numberOfBytesRead;
   1.171 +			if (numberOfBytesToReadThisTime>remainingNumberOfBytesToRead)
   1.172 +				{
   1.173 +				numberOfBytesToReadThisTime=remainingNumberOfBytesToRead;
   1.174 +				}
   1.175 +			const int numberOfBytesReadThisTime=fread(aBuffer, 1, numberOfBytesToReadThisTime, aInputFile);
   1.176 +			const int error=ferror(aInputFile);
   1.177 +			if (error==0)
   1.178 +				{
   1.179 +				aBuffer=((unsigned char*)aBuffer)+numberOfBytesReadThisTime;
   1.180 +				numberOfBytesRead+=numberOfBytesReadThisTime;
   1.181 +				Assert(numberOfBytesRead<=aNumberOfBytesToRead, "internal error (read too many bytes)");
   1.182 +				if ((numberOfBytesRead>=aNumberOfBytesToRead) || feof(aInputFile))
   1.183 +					{
   1.184 +					return numberOfBytesRead;
   1.185 +					}
   1.186 +				break;
   1.187 +				}
   1.188 +			numberOfBytesToReadThisTime/=2;
   1.189 +			Assert(numberOfBytesToReadThisTime>0, "reading from file failed with error number %d", (const void*)error);
   1.190 +			clearerr(aInputFile);
   1.191 +			}
   1.192 +		}
   1.193 +	}
   1.194 +
   1.195 +void WriteToFile(const void* aBuffer, int aNumberOfBytesToWrite, FILE* aOutputFile)
   1.196 +	{
   1.197 +	const int numberOfBytesWritten=fwrite(aBuffer, 1, aNumberOfBytesToWrite, aOutputFile);
   1.198 +	Assert(numberOfBytesWritten==aNumberOfBytesToWrite, "only %d out of %d bytes could be written to file", (const void*)numberOfBytesWritten, (const void*)aNumberOfBytesToWrite);
   1.199 +	const int error=ferror(aOutputFile);
   1.200 +	Assert(error==0, "writing to file failed with error number %d", (const void*)error);
   1.201 +	}
   1.202 +
   1.203 +void HandleByteOrderMarks(int aOutputByteOrderMark, TByteOrder& aUnicodeByteOrder, const char* aInputEncoding, const char* aOutputEncoding, FILE* aInputFile, FILE* aOutputFile)
   1.204 +	{
   1.205 +	if (_stricmp(aInputEncoding, "unicode")==0)
   1.206 +		{
   1.207 +		unsigned short firstUnicodeCharacter=0;
   1.208 +		const int numberOfBytesRead=ReadFromFileReturningNumberOfBytesRead((void*)&firstUnicodeCharacter, sizeof(unsigned short), aInputFile);
   1.209 +		TByteOrder byteOrderSpecifiedByByteOrderMark=EByteOrderUnspecified;
   1.210 +		if (numberOfBytesRead==sizeof(unsigned short))
   1.211 +			{
   1.212 +			switch (firstUnicodeCharacter)
   1.213 +				{
   1.214 +			case 0xfeff:
   1.215 +				byteOrderSpecifiedByByteOrderMark=EByteOrderNative;
   1.216 +				break;
   1.217 +			case 0xfffe:
   1.218 +				byteOrderSpecifiedByByteOrderMark=EByteOrderForeign;
   1.219 +				break;
   1.220 +			default:
   1.221 +				const int error=fseek(aInputFile, 0, SEEK_SET); // rewind to the start of the file
   1.222 +				Assert(error==0, "could not rewind to the start of the input file");
   1.223 +				break;
   1.224 +				}
   1.225 +			}
   1.226 +		if (byteOrderSpecifiedByByteOrderMark!=EByteOrderUnspecified)
   1.227 +			{
   1.228 +			if ((aUnicodeByteOrder!=EByteOrderUnspecified) && (byteOrderSpecifiedByByteOrderMark!=aUnicodeByteOrder))
   1.229 +				{
   1.230 +				PrintWarning("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
   1.231 +				}
   1.232 +			aUnicodeByteOrder=byteOrderSpecifiedByByteOrderMark;
   1.233 +			}
   1.234 +		}
   1.235 +	if (aOutputByteOrderMark)
   1.236 +		{
   1.237 +		if (_stricmp(aOutputEncoding, "unicode")!=0)
   1.238 +			{
   1.239 +			PrintWarning("\"-byteordermark\" is only relevant for unicode output");
   1.240 +			}
   1.241 +		else
   1.242 +			{
   1.243 +			Assert(aUnicodeByteOrder!=EByteOrderUnspecified, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
   1.244 +			unsigned short firstUnicodeCharacter=(unsigned short)((aUnicodeByteOrder==EByteOrderNative)? 0xfeff: 0xfffe);
   1.245 +			WriteToFile((const void*)&firstUnicodeCharacter, sizeof(unsigned short), aOutputFile);
   1.246 +			}
   1.247 +		}
   1.248 +	}
   1.249 +
   1.250 +void ObeyRequiredByteOrderIfUnicode(TByteOrder& aUnicodeByteOrder, const char* aEncoding, SBuffer& aBuffer)
   1.251 +	{
   1.252 +	if (_stricmp(aEncoding, "unicode")==0)
   1.253 +		{
   1.254 +		Assert(aBuffer.iNumberOfBytes%sizeof(wchar_t)==0, "internal error (bad number of bytes in unicode buffer)");
   1.255 +		if (aUnicodeByteOrder==EByteOrderUnspecified)
   1.256 +			{
   1.257 +			PrintWarning("the byte order of unicode text is unspecified - defaulting to little endian");
   1.258 +			aUnicodeByteOrder=EByteOrderLittleEndian;
   1.259 +			}
   1.260 +		if (aUnicodeByteOrder==EByteOrderForeign)
   1.261 +			{
   1.262 +			for (unsigned char* bytePointer=((unsigned char*)aBuffer.iData)+(aBuffer.iNumberOfBytes-sizeof(wchar_t)); bytePointer>=aBuffer.iData; bytePointer-=sizeof(wchar_t))
   1.263 +				{
   1.264 +				unsigned char temp=*bytePointer;
   1.265 +				*bytePointer=*(bytePointer+1);
   1.266 +				*(bytePointer+1)=temp;
   1.267 +				}
   1.268 +			}
   1.269 +		}
   1.270 +	}
   1.271 +
   1.272 +int OtherToUnicode(const char* aInputEncoding, wchar_t* aUnicode, const char* aOther)
   1.273 +// if the output parameter is NULL, it returns the precise size of the would-be output parameter (in terms of number of "wchar_t"s) excluding any trailing '\0', otherwise it returns 0
   1.274 +	{
   1.275 +	if (_stricmp(aInputEncoding, "utf8")==0)
   1.276 +		{
   1.277 +		return Utf8ToUnicode(aUnicode, aOther);
   1.278 +		}
   1.279 +	char localeData[100];
   1.280 +	strcpy(localeData, ".");
   1.281 +	strcat(localeData, aInputEncoding);
   1.282 +	Assert(setlocale(LC_ALL, localeData)!=NULL, "could not convert from encoding \"%s\"", aInputEncoding);
   1.283 +	return mbstowcs(aUnicode, aOther, KLargeNumber);
   1.284 +	}
   1.285 +
   1.286 +int UnicodeToOther(const char* aOutputEncoding, char* aOther, const wchar_t* aUnicode)
   1.287 +// if the output parameter is NULL, it returns the precise size of the would-be output parameter (in terms of number of "char"s) excluding any trailing '\0', otherwise it returns 0
   1.288 +	{
   1.289 +	if (_stricmp(aOutputEncoding, "utf8")==0)
   1.290 +		{
   1.291 +		return UnicodeToUtf8(aOther, aUnicode);
   1.292 +		}
   1.293 +	char localeData[100];
   1.294 +	strcpy(localeData, ".");
   1.295 +	strcat(localeData, aOutputEncoding);
   1.296 +	Assert(setlocale(LC_ALL, localeData)!=NULL, "could not convert to encoding \"%s\"", aOutputEncoding);
   1.297 +	return wcstombs(aOther, aUnicode, KLargeNumber);
   1.298 +	}
   1.299 +
   1.300 +void DoConversion(TByteOrder& aUnicodeByteOrder, const char* aInputEncoding, const char* aOutputEncoding, FILE* aInputFile, FILE* aOutputFile)
   1.301 +	{
   1.302 +	SBuffer arrayOfBuffers[3];
   1.303 +	arrayOfBuffers[0].iNumberOfBytes=0;
   1.304 +	arrayOfBuffers[0].iData=malloc(KLargeNumber+2); // +2 for the 2 '\0' bytes appended to the data read from file
   1.305 +	Assert(arrayOfBuffers[0].iData!=NULL, "cannot allocate enough memory");
   1.306 +	arrayOfBuffers[1].iNumberOfBytes=0;
   1.307 +	arrayOfBuffers[1].iData=NULL;
   1.308 +	arrayOfBuffers[2].iNumberOfBytes=0;
   1.309 +	arrayOfBuffers[2].iData=NULL;
   1.310 +	SBuffer* currentBuffer=arrayOfBuffers;
   1.311 +	currentBuffer->iNumberOfBytes=ReadFromFileReturningNumberOfBytesRead(currentBuffer->iData, KLargeNumber, aInputFile);
   1.312 +	// append 2 '\0' bytes at the end of the buffer read from file (2 in case it is unicode)
   1.313 +	((char*)currentBuffer->iData)[currentBuffer->iNumberOfBytes]='\0';
   1.314 +	((char*)currentBuffer->iData)[currentBuffer->iNumberOfBytes+1]='\0';
   1.315 +	ObeyRequiredByteOrderIfUnicode(aUnicodeByteOrder, aInputEncoding, *currentBuffer);
   1.316 +	// if the input and output encodings are different, convert from one to the other (via unicode if neither is itself unicode)
   1.317 +	if (_stricmp(aInputEncoding, aOutputEncoding)!=0)
   1.318 +		{
   1.319 +		if (_stricmp(aInputEncoding, "unicode")!=0)
   1.320 +			{
   1.321 +			SBuffer* nextBuffer=currentBuffer+1;
   1.322 +			nextBuffer->iNumberOfBytes=sizeof(wchar_t)*OtherToUnicode(aInputEncoding, NULL, (const char*)currentBuffer->iData);
   1.323 +			Assert(nextBuffer->iNumberOfBytes>=0, "invalid multi-byte character encountered");
   1.324 +			nextBuffer->iData=malloc(nextBuffer->iNumberOfBytes+sizeof(wchar_t)); // "+sizeof(wchar_t)" for terminating '\0'
   1.325 +			Assert(nextBuffer->iData!=NULL, "cannot allocate enough memory");
   1.326 +			OtherToUnicode(aInputEncoding, (wchar_t*)nextBuffer->iData, (const char*)currentBuffer->iData);
   1.327 +			currentBuffer=nextBuffer;
   1.328 +			}
   1.329 +		if (_stricmp(aOutputEncoding, "unicode")!=0)
   1.330 +			{
   1.331 +			SBuffer* nextBuffer=currentBuffer+1;
   1.332 +			nextBuffer->iNumberOfBytes=sizeof(char)*UnicodeToOther(aOutputEncoding, NULL, (const wchar_t*)currentBuffer->iData);
   1.333 +			Assert(nextBuffer->iNumberOfBytes>=0, "unconvertible unicode character encountered");
   1.334 +			nextBuffer->iData=malloc(nextBuffer->iNumberOfBytes+sizeof(char)); // "+sizeof(char)" for terminating '\0'
   1.335 +			Assert(nextBuffer->iData!=NULL, "cannot allocate enough memory");
   1.336 +			UnicodeToOther(aOutputEncoding, (char*)nextBuffer->iData, (const wchar_t*)currentBuffer->iData);
   1.337 +			currentBuffer=nextBuffer;
   1.338 +			}
   1.339 +		}
   1.340 +	ObeyRequiredByteOrderIfUnicode(aUnicodeByteOrder, aOutputEncoding, *currentBuffer);
   1.341 +	WriteToFile((const void*)currentBuffer->iData, currentBuffer->iNumberOfBytes, aOutputFile);
   1.342 +	free(arrayOfBuffers[0].iData);
   1.343 +	free(arrayOfBuffers[1].iData);
   1.344 +	free(arrayOfBuffers[2].iData);
   1.345 +	}
   1.346 +
   1.347 +void FlushAndCloseFiles(FILE* aInputFile, FILE* aOutputFile)
   1.348 +	{
   1.349 +	Assert(fflush(aOutputFile)==0, "flushing output-file failed");
   1.350 +	if (aInputFile!=stdin)
   1.351 +		{
   1.352 +		Assert(fclose(aInputFile)==0, "closing input-file failed");
   1.353 +		}
   1.354 +	if (aOutputFile!=stdout)
   1.355 +		{
   1.356 +		Assert(fclose(aOutputFile)==0, "closing output-file failed");
   1.357 +		}
   1.358 +	}
   1.359 +
   1.360 +int main(int aArgc, char* aArgv[])
   1.361 +	{
   1.362 +	int outputByteOrderMark=0;
   1.363 +	TByteOrder unicodeByteOrder=EByteOrderUnspecified;
   1.364 +	const char* inputEncoding=NULL;
   1.365 +	const char* outputEncoding=NULL;
   1.366 +	FILE* inputFile=stdin;
   1.367 +	FILE* outputFile=stdout;
   1.368 +	ReadParameters(aArgc, aArgv, outputByteOrderMark, unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   1.369 +#if defined(__VC32__)
   1.370 +	_setmode(_fileno(inputFile), _O_BINARY);
   1.371 +	_setmode(_fileno(outputFile), _O_BINARY);
   1.372 +#endif
   1.373 +	HandleByteOrderMarks(outputByteOrderMark, unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   1.374 +	DoConversion(unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   1.375 +	FlushAndCloseFiles(inputFile, outputFile);
   1.376 +	return 0;
   1.377 +	}
   1.378 +
   1.379 +