os/textandloc/charconvfw/charconv_fw/tools/convtool/convtool.cpp
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2 * Copyright (c) 1997-2009 Nokia Corporation and/or its subsidiary(-ies).
     3 * All rights reserved.
     4 * This component and the accompanying materials are made available
     5 * under the terms of "Eclipse Public License v1.0"
     6 * which accompanies this distribution, and is available
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
     8 *
     9 * Initial Contributors:
    10 * Nokia Corporation - initial contribution.
    11 *
    12 * Contributors:
    13 *
    14 * Description: 
    15 *
    16 */
    17 
    18 
    19 #pragma warning (disable: 4514) // unreferenced inline/local function has been removed
    20 
    21 #include <stdio.h>
    22 #include <stdlib.h>
    23 #include <string.h>
    24 #include <locale.h>
    25 #include <wchar.h>
    26 #if defined(__VC32__)
    27 #include <FCNTL.H>
    28 #include <IO.H>
    29 #endif
    30 
    31 #undef BIG_ENDIAN
    32 
    33 #if (defined(__MSVCRT__) || defined(_MSC_VER))
    34 //#define _stricmp  _stricmp
    35 //#define _strnicmp _strnicmp
    36 #else // linux 
    37 #define _stricmp  strcasecmp
    38 #define _strnicmp strncasecmp
    39 #endif
    40 
    41 const int KVersionNumber=025;
    42 const int KLargeNumber=1000000;
    43 
    44 extern int Utf8ToUnicode(wchar_t* aUnicode, const char* aUtf8);
    45 extern int UnicodeToUtf8(char* aUtf8, const wchar_t* aUnicode);
    46 
    47 enum TByteOrder
    48 	{
    49 	EByteOrderUnspecified,
    50 	EByteOrderBigEndian,
    51 	EByteOrderLittleEndian,
    52 #if defined(BIG_ENDIAN)
    53 	EByteOrderNative=EByteOrderBigEndian,
    54 	EByteOrderForeign=EByteOrderLittleEndian
    55 #else
    56 	EByteOrderNative=EByteOrderLittleEndian,
    57 	EByteOrderForeign=EByteOrderBigEndian
    58 #endif
    59 	};
    60 
// A block of raw bytes used as one stage of a conversion (see DoConversion()).
struct SBuffer
	{
	int iNumberOfBytes; // number of bytes of data held at iData (the terminating '\0' bytes that DoConversion() appends are not counted)
	void* iData; // start of the data; DoConversion() sets this to NULL for buffers it has not allocated
	};
    66 
    67 void PrintUsage(const char* aProgramName)
    68     {
    69 	fprintf(stderr, "\nVersion %03d\n\nCharacter set conversion tool\nCopyright (c) 1999 Symbian Ltd\n\n", KVersionNumber);
    70 	fprintf(stderr, "Usage:\n\n\t%s [<options>] <inputspec> <outputspec>\n\nwhere\n\n\t"
    71 				"options    :=  [-big|-little][-byteordermark]\n\t"
    72 				"inputspec  :=  -input=<format> [<input_file>]\n\t"
    73 				"outputspec :=  -output=<format> [<output_file>]\n\t"
    74 				"format     :=  unicode|1252|utf8|...\n\n", aProgramName);
    75 	const char* localeData=setlocale(LC_ALL, "");
    76 	while (*localeData!='.')
    77 		{
    78 		++localeData;
    79 		}
    80 	fprintf(stderr, "(The default encoding is currently \"%s\")\n\n", localeData+1);
    81 	}
    82 
    83 void Assert(int aCondition, const char* aErrorMessageFormat, const void* aExtraParameter1=NULL, const void* aExtraParameter2=NULL)
    84 	{
    85 	if (!aCondition)
    86 		{
    87 		char errorMessage[100];
    88 		sprintf(errorMessage, aErrorMessageFormat, aExtraParameter1, aExtraParameter2);
    89 		fprintf(stderr, "Error: %s\n", errorMessage);
    90 		exit(1);
    91 		}
    92 	}
    93 
    94 void PrintWarning(const char* aWarningMessage)
    95 	{
    96 	fprintf(stderr, "Warning: %s\n", aWarningMessage);
    97 	}
    98 
    99 int TryFileParameter(int aArgc, char* aArgv[], int& aArgIndex, const char* aInputOrOutput, const char*& aEncoding, FILE*& aFile, const char* aFileMode)
   100 	{
   101 	char prefix[100];
   102 	strcpy(prefix, "-");
   103 	strcat(prefix, aInputOrOutput);
   104 	strcat(prefix, "=");
   105 	int lengthOfPrefix=strlen(prefix);
   106 	if (_strnicmp(aArgv[aArgIndex], prefix, lengthOfPrefix)==0)
   107 		{
   108 		Assert(aEncoding==NULL, "\"%s...\" is specified more than once", prefix);
   109 		aEncoding=aArgv[aArgIndex]+lengthOfPrefix;
   110 		++aArgIndex;
   111 		if ((aArgIndex>=aArgc) || (aArgv[aArgIndex][0]=='-'))
   112 			{
   113 			--aArgIndex;
   114 			}
   115 		else
   116 			{
   117 			aFile=fopen(aArgv[aArgIndex], aFileMode);
   118 			Assert(aFile!=NULL, "opening %s-file failed", aInputOrOutput);
   119 			}
   120 		return 1;
   121 		}
   122 	return 0;
   123 	}
   124 
   125 void ReadParameters(int aArgc, char* aArgv[], int& aOutputByteOrderMark, TByteOrder& aUnicodeByteOrder, const char*& aInputEncoding, const char*& aOutputEncoding, FILE*& aInputFile, FILE*& aOutputFile)
   126 	{
   127 	if ((aArgc<=1) || (_stricmp(aArgv[1], "?")==0) || (_stricmp(aArgv[1], "/?")==0))
   128 		{
   129 		PrintUsage(aArgv[0]);
   130 		exit(0);
   131 		}
   132 	for (int i=1; i<aArgc; ++i) // start at index 1 to avoid the program name (which is the first parameter)
   133 		{
   134 		if (_stricmp(aArgv[i], "-byteordermark")==0)
   135 			{
   136 			Assert(!aOutputByteOrderMark, "\"-byteordermark\" is specified more than once");
   137 			aOutputByteOrderMark=1;
   138 			}
   139 		else if (_stricmp(aArgv[i], "-big")==0)
   140 			{
   141 			Assert(aUnicodeByteOrder==EByteOrderUnspecified, "the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   142 			aUnicodeByteOrder=EByteOrderBigEndian;
   143 			}
   144 		else if (_stricmp(aArgv[i], "-little")==0)
   145 			{
   146 			Assert(aUnicodeByteOrder==EByteOrderUnspecified, "the byte order of unicode text (i.e. \"-big\"/\"-little\") is specified more than once");
   147 			aUnicodeByteOrder=EByteOrderLittleEndian;
   148 			}
   149 		else
   150 			{
   151 			Assert(TryFileParameter(aArgc, aArgv, i, "input", aInputEncoding, aInputFile, "r") ||
   152 				   TryFileParameter(aArgc, aArgv, i, "output", aOutputEncoding, aOutputFile, "w"), "bad parameter \"%s\"", aArgv[i]);
   153 			}
   154 		}
   155 	Assert(aInputEncoding!=NULL, "no input encoding is specified");
   156 	Assert(aOutputEncoding!=NULL, "no output encoding is specified");
   157 	}
   158 
   159 int ReadFromFileReturningNumberOfBytesRead(void* aBuffer, int aNumberOfBytesToRead, FILE* aInputFile)
   160 	{
   161 	int numberOfBytesRead=0;
   162 	int numberOfBytesToReadThisTime=aNumberOfBytesToRead;
   163 	for (;;)
   164 		{
   165 		for (;;)
   166 			{
   167 			const int remainingNumberOfBytesToRead=aNumberOfBytesToRead-numberOfBytesRead;
   168 			if (numberOfBytesToReadThisTime>remainingNumberOfBytesToRead)
   169 				{
   170 				numberOfBytesToReadThisTime=remainingNumberOfBytesToRead;
   171 				}
   172 			const int numberOfBytesReadThisTime=fread(aBuffer, 1, numberOfBytesToReadThisTime, aInputFile);
   173 			const int error=ferror(aInputFile);
   174 			if (error==0)
   175 				{
   176 				aBuffer=((unsigned char*)aBuffer)+numberOfBytesReadThisTime;
   177 				numberOfBytesRead+=numberOfBytesReadThisTime;
   178 				Assert(numberOfBytesRead<=aNumberOfBytesToRead, "internal error (read too many bytes)");
   179 				if ((numberOfBytesRead>=aNumberOfBytesToRead) || feof(aInputFile))
   180 					{
   181 					return numberOfBytesRead;
   182 					}
   183 				break;
   184 				}
   185 			numberOfBytesToReadThisTime/=2;
   186 			Assert(numberOfBytesToReadThisTime>0, "reading from file failed with error number %d", (const void*)error);
   187 			clearerr(aInputFile);
   188 			}
   189 		}
   190 	}
   191 
   192 void WriteToFile(const void* aBuffer, int aNumberOfBytesToWrite, FILE* aOutputFile)
   193 	{
   194 	const int numberOfBytesWritten=fwrite(aBuffer, 1, aNumberOfBytesToWrite, aOutputFile);
   195 	Assert(numberOfBytesWritten==aNumberOfBytesToWrite, "only %d out of %d bytes could be written to file", (const void*)numberOfBytesWritten, (const void*)aNumberOfBytesToWrite);
   196 	const int error=ferror(aOutputFile);
   197 	Assert(error==0, "writing to file failed with error number %d", (const void*)error);
   198 	}
   199 
   200 void HandleByteOrderMarks(int aOutputByteOrderMark, TByteOrder& aUnicodeByteOrder, const char* aInputEncoding, const char* aOutputEncoding, FILE* aInputFile, FILE* aOutputFile)
   201 	{
   202 	if (_stricmp(aInputEncoding, "unicode")==0)
   203 		{
   204 		unsigned short firstUnicodeCharacter=0;
   205 		const int numberOfBytesRead=ReadFromFileReturningNumberOfBytesRead((void*)&firstUnicodeCharacter, sizeof(unsigned short), aInputFile);
   206 		TByteOrder byteOrderSpecifiedByByteOrderMark=EByteOrderUnspecified;
   207 		if (numberOfBytesRead==sizeof(unsigned short))
   208 			{
   209 			switch (firstUnicodeCharacter)
   210 				{
   211 			case 0xfeff:
   212 				byteOrderSpecifiedByByteOrderMark=EByteOrderNative;
   213 				break;
   214 			case 0xfffe:
   215 				byteOrderSpecifiedByByteOrderMark=EByteOrderForeign;
   216 				break;
   217 			default:
   218 				const int error=fseek(aInputFile, 0, SEEK_SET); // rewind to the start of the file
   219 				Assert(error==0, "could not rewind to the start of the input file");
   220 				break;
   221 				}
   222 			}
   223 		if (byteOrderSpecifiedByByteOrderMark!=EByteOrderUnspecified)
   224 			{
   225 			if ((aUnicodeByteOrder!=EByteOrderUnspecified) && (byteOrderSpecifiedByByteOrderMark!=aUnicodeByteOrder))
   226 				{
   227 				PrintWarning("the byte order specified by the byte-order mark in the unicode input is different from the byte order specified by the parameter - taking the byte-order specified by the byte-order mark in the unicode input");
   228 				}
   229 			aUnicodeByteOrder=byteOrderSpecifiedByByteOrderMark;
   230 			}
   231 		}
   232 	if (aOutputByteOrderMark)
   233 		{
   234 		if (_stricmp(aOutputEncoding, "unicode")!=0)
   235 			{
   236 			PrintWarning("\"-byteordermark\" is only relevant for unicode output");
   237 			}
   238 		else
   239 			{
   240 			Assert(aUnicodeByteOrder!=EByteOrderUnspecified, "the byte order must be specified if a byte-order mark is to be added to the unicode output");
   241 			unsigned short firstUnicodeCharacter=(unsigned short)((aUnicodeByteOrder==EByteOrderNative)? 0xfeff: 0xfffe);
   242 			WriteToFile((const void*)&firstUnicodeCharacter, sizeof(unsigned short), aOutputFile);
   243 			}
   244 		}
   245 	}
   246 
   247 void ObeyRequiredByteOrderIfUnicode(TByteOrder& aUnicodeByteOrder, const char* aEncoding, SBuffer& aBuffer)
   248 	{
   249 	if (_stricmp(aEncoding, "unicode")==0)
   250 		{
   251 		Assert(aBuffer.iNumberOfBytes%sizeof(wchar_t)==0, "internal error (bad number of bytes in unicode buffer)");
   252 		if (aUnicodeByteOrder==EByteOrderUnspecified)
   253 			{
   254 			PrintWarning("the byte order of unicode text is unspecified - defaulting to little endian");
   255 			aUnicodeByteOrder=EByteOrderLittleEndian;
   256 			}
   257 		if (aUnicodeByteOrder==EByteOrderForeign)
   258 			{
   259 			for (unsigned char* bytePointer=((unsigned char*)aBuffer.iData)+(aBuffer.iNumberOfBytes-sizeof(wchar_t)); bytePointer>=aBuffer.iData; bytePointer-=sizeof(wchar_t))
   260 				{
   261 				unsigned char temp=*bytePointer;
   262 				*bytePointer=*(bytePointer+1);
   263 				*(bytePointer+1)=temp;
   264 				}
   265 			}
   266 		}
   267 	}
   268 
   269 int OtherToUnicode(const char* aInputEncoding, wchar_t* aUnicode, const char* aOther)
   270 // if the output parameter is NULL, it returns the precise size of the would-be output parameter (in terms of number of "wchar_t"s) excluding any trailing '\0', otherwise it returns 0
   271 	{
   272 	if (_stricmp(aInputEncoding, "utf8")==0)
   273 		{
   274 		return Utf8ToUnicode(aUnicode, aOther);
   275 		}
   276 	char localeData[100];
   277 	strcpy(localeData, ".");
   278 	strcat(localeData, aInputEncoding);
   279 	Assert(setlocale(LC_ALL, localeData)!=NULL, "could not convert from encoding \"%s\"", aInputEncoding);
   280 	return mbstowcs(aUnicode, aOther, KLargeNumber);
   281 	}
   282 
   283 int UnicodeToOther(const char* aOutputEncoding, char* aOther, const wchar_t* aUnicode)
   284 // if the output parameter is NULL, it returns the precise size of the would-be output parameter (in terms of number of "char"s) excluding any trailing '\0', otherwise it returns 0
   285 	{
   286 	if (_stricmp(aOutputEncoding, "utf8")==0)
   287 		{
   288 		return UnicodeToUtf8(aOther, aUnicode);
   289 		}
   290 	char localeData[100];
   291 	strcpy(localeData, ".");
   292 	strcat(localeData, aOutputEncoding);
   293 	Assert(setlocale(LC_ALL, localeData)!=NULL, "could not convert to encoding \"%s\"", aOutputEncoding);
   294 	return wcstombs(aOther, aUnicode, KLargeNumber);
   295 	}
   296 
   297 void DoConversion(TByteOrder& aUnicodeByteOrder, const char* aInputEncoding, const char* aOutputEncoding, FILE* aInputFile, FILE* aOutputFile)
   298 	{
   299 	SBuffer arrayOfBuffers[3];
   300 	arrayOfBuffers[0].iNumberOfBytes=0;
   301 	arrayOfBuffers[0].iData=malloc(KLargeNumber+2); // +2 for the 2 '\0' bytes appended to the data read from file
   302 	Assert(arrayOfBuffers[0].iData!=NULL, "cannot allocate enough memory");
   303 	arrayOfBuffers[1].iNumberOfBytes=0;
   304 	arrayOfBuffers[1].iData=NULL;
   305 	arrayOfBuffers[2].iNumberOfBytes=0;
   306 	arrayOfBuffers[2].iData=NULL;
   307 	SBuffer* currentBuffer=arrayOfBuffers;
   308 	currentBuffer->iNumberOfBytes=ReadFromFileReturningNumberOfBytesRead(currentBuffer->iData, KLargeNumber, aInputFile);
   309 	// append 2 '\0' bytes at the end of the buffer read from file (2 in case it is unicode)
   310 	((char*)currentBuffer->iData)[currentBuffer->iNumberOfBytes]='\0';
   311 	((char*)currentBuffer->iData)[currentBuffer->iNumberOfBytes+1]='\0';
   312 	ObeyRequiredByteOrderIfUnicode(aUnicodeByteOrder, aInputEncoding, *currentBuffer);
   313 	// if the input and output encodings are different, convert from one to the other (via unicode if neither is itself unicode)
   314 	if (_stricmp(aInputEncoding, aOutputEncoding)!=0)
   315 		{
   316 		if (_stricmp(aInputEncoding, "unicode")!=0)
   317 			{
   318 			SBuffer* nextBuffer=currentBuffer+1;
   319 			nextBuffer->iNumberOfBytes=sizeof(wchar_t)*OtherToUnicode(aInputEncoding, NULL, (const char*)currentBuffer->iData);
   320 			Assert(nextBuffer->iNumberOfBytes>=0, "invalid multi-byte character encountered");
   321 			nextBuffer->iData=malloc(nextBuffer->iNumberOfBytes+sizeof(wchar_t)); // "+sizeof(wchar_t)" for terminating '\0'
   322 			Assert(nextBuffer->iData!=NULL, "cannot allocate enough memory");
   323 			OtherToUnicode(aInputEncoding, (wchar_t*)nextBuffer->iData, (const char*)currentBuffer->iData);
   324 			currentBuffer=nextBuffer;
   325 			}
   326 		if (_stricmp(aOutputEncoding, "unicode")!=0)
   327 			{
   328 			SBuffer* nextBuffer=currentBuffer+1;
   329 			nextBuffer->iNumberOfBytes=sizeof(char)*UnicodeToOther(aOutputEncoding, NULL, (const wchar_t*)currentBuffer->iData);
   330 			Assert(nextBuffer->iNumberOfBytes>=0, "unconvertible unicode character encountered");
   331 			nextBuffer->iData=malloc(nextBuffer->iNumberOfBytes+sizeof(char)); // "+sizeof(char)" for terminating '\0'
   332 			Assert(nextBuffer->iData!=NULL, "cannot allocate enough memory");
   333 			UnicodeToOther(aOutputEncoding, (char*)nextBuffer->iData, (const wchar_t*)currentBuffer->iData);
   334 			currentBuffer=nextBuffer;
   335 			}
   336 		}
   337 	ObeyRequiredByteOrderIfUnicode(aUnicodeByteOrder, aOutputEncoding, *currentBuffer);
   338 	WriteToFile((const void*)currentBuffer->iData, currentBuffer->iNumberOfBytes, aOutputFile);
   339 	free(arrayOfBuffers[0].iData);
   340 	free(arrayOfBuffers[1].iData);
   341 	free(arrayOfBuffers[2].iData);
   342 	}
   343 
   344 void FlushAndCloseFiles(FILE* aInputFile, FILE* aOutputFile)
   345 	{
   346 	Assert(fflush(aOutputFile)==0, "flushing output-file failed");
   347 	if (aInputFile!=stdin)
   348 		{
   349 		Assert(fclose(aInputFile)==0, "closing input-file failed");
   350 		}
   351 	if (aOutputFile!=stdout)
   352 		{
   353 		Assert(fclose(aOutputFile)==0, "closing output-file failed");
   354 		}
   355 	}
   356 
   357 int main(int aArgc, char* aArgv[])
   358 	{
   359 	int outputByteOrderMark=0;
   360 	TByteOrder unicodeByteOrder=EByteOrderUnspecified;
   361 	const char* inputEncoding=NULL;
   362 	const char* outputEncoding=NULL;
   363 	FILE* inputFile=stdin;
   364 	FILE* outputFile=stdout;
   365 	ReadParameters(aArgc, aArgv, outputByteOrderMark, unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   366 #if defined(__VC32__)
   367 	_setmode(_fileno(inputFile), _O_BINARY);
   368 	_setmode(_fileno(outputFile), _O_BINARY);
   369 #endif
   370 	HandleByteOrderMarks(outputByteOrderMark, unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   371 	DoConversion(unicodeByteOrder, inputEncoding, outputEncoding, inputFile, outputFile);
   372 	FlushAndCloseFiles(inputFile, outputFile);
   373 	return 0;
   374 	}
   375 
   376