1 // Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of "Eclipse Public License v1.0"
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
14 // This file contains the declaration of the generic CMDXMLParser class
15 // which is responsible for creating a DOM structure
16 // from a given XML file.
24 #ifndef __GMXMLPARSER_H__
25 #define __GMXMLPARSER_H__
29 #include <gmxmlconstants.h>
34 class CMDXMLEntityConverter;
class MMDXMLParserObserver
/** Abstract observer interface for notification when XML parsing is complete.

It should be implemented by users of CMDXMLParser.
*/
	{
public:
	/**
	Call back function used to inform a client of the parser when a parsing
	operation completes.
	*/
	virtual void ParseFileCompleteL() = 0;
	};
54 class MMDXMLParserDataProvider
55 /** Abstract data source interface for XML data source.
57 The user of CMDXMLParser must build one of these to encapsulate the data source
58 that they wish to parse. CMDXMLParser implements a file-based data source to
59 implement the functionality of the ParseFile function.
65 /** Status codes returned by GetData() implementations. */
66 enum TDataProviderResults
68 KMoreData, //< Returned by the interface implementation when it is returning more data.
69 KDataStreamError, //< Returned by the interface when an unrecoverable error prevents obtaining more data. A recoverable error should be represented by KDataNotReady.
70 KDataStreamEnd //< Returned by the interface when there is no more data to come.
75 The XML Parser calls this on a specific data provider to get more data
78 Note that the TPtrC supplied may be used by the parser at any time
79 between the return of this call and the next call that the parser
82 Your data provider must not move the data pointed to until the
83 parser has indicated that it's done with that block by asking for
86 Ownership of the data pointed to remains with the data provider.
89 General comments on efficiency
90 ------------------------------
92 The parser is designed such that it processes the whole data block
93 provided in one go. It will automatically become asynchronous when
94 another block is required - the data provider only needs to supply
97 Because of this design, it allows the data provider to indirectly
98 control the amount of processing time that will be needed
101 It is a good idea to balance the need for the fastest possible
102 processing with the need for client application responsiveness by
103 ensuring that the amount of data passed in a single block is not
104 too large. However, it is worth bearing in mind that the parser
105 will convert UTF8 data streams in blocks of 32 characters, and
106 supplying blocks of smaller length than this will result in a
107 slight loss of efficiency.
109 @param aPtr On return, the data provided
110 @param aStatus Asynchronous status to be completed by the function with a
111 TDataProviderResults value
113 virtual void GetData(TPtrC8 &aPtr, TRequestStatus &aStatus) = 0;
115 Called to indicate that use of the data source is complete.
117 virtual void Disconnect() = 0;
120 class CMDXMLParserFileDataSource;
122 class CMDXMLParser: public CActive
123 /** Creates a DOM structure from a given XML file.
125 The parsing operation is asynchronous and is initiated by a call to ParseFile().
126 On completion, the created DOM document can be retrieved through DetachXMLDoc().
128 Note the following ownership rules for the DOM document:
130 1. calling DetachXMLDoc() transfers ownership of the document to the client
132 2. if the parser is asked to parse a new file while it still owns an existing
133 DOM document, it will delete the old document.
140 /** Allocates and constructs a new XML parser without specifying a DTD.
142 @param aParserObserver XML parser observer
143 @leave KErrNoMemory Out of memory
144 @return New XML parser */
145 IMPORT_C static CMDXMLParser* NewL(MMDXMLParserObserver* aParserObserver);
147 /** Allocates and constructs a new XML parser, specifying a DTD.
149 @param aParserObserver XML parser observer
150 @param aDtdRepresentation DTD validator
151 @leave KErrNoMemory Out of memory
152 @return New XML parser */
153 IMPORT_C static CMDXMLParser* NewL(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
155 /** Allocates and constructs a new XML parser, leaving the object on the cleanup stack.
158 @param aParserObserver XML parser observer
159 @leave KErrNoMemory Out of memory
160 @return New XML parser */
161 IMPORT_C static CMDXMLParser* NewLC(MMDXMLParserObserver* aParserObserver);
163 /** Allocates and constructs a new XML parser, leaving the object on the cleanup stack.
166 @param aParserObserver XML parser observer
167 @param aDtdRepresentation DTD validator
168 @leave KErrNoMemory Out of memory
169 @return New XML parser */
170 IMPORT_C static CMDXMLParser* NewLC(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
174 IMPORT_C ~CMDXMLParser();
176 /** Gets the last error found by the parser.
180 IMPORT_C TInt Error() const;
183 Get the severity of the most severe error found.
184 @return the maximum error severity
186 IMPORT_C TXMLErrorCodeSeverity ErrorSeverity() const;
188 /** Gets the created DOM.
190 This should be called after the conclusion of the parser process.
192 Note that the function sets the internal variable pointing to the document
193 to NULL, so this function can only be called once per file parse. The caller
194 takes ownership of the document, and must delete it when its use is complete.
196 @return The created DOM */
197 IMPORT_C CMDXMLDocument* DetachXMLDoc();
199 /** Parses a specified XML file into a DOM object tree.
201 @param aRFs File server session
202 @param aFileToParse The file name to parse
203 @return KErrNone if success or a file read error code */
204 IMPORT_C TInt ParseFile(RFs aRFs, const TDesC& aFileToParse);
206 IMPORT_C TInt ParseFile(RFile& aFileHandleToParse);
208 /** Parses a specified XML Data Source into a DOM object tree.
209 Use ParseSourceL() function in preference to ParseSource()
210 @param aSource MMDXMLParserDataProvider pointer
212 inline void ParseSource(MMDXMLParserDataProvider *aSource)
214 TRAP_IGNORE(ParseSourceL(aSource));
217 /** Parses a specified XML Data Source into a DOM object tree.
218 @param aSource MMDXMLParserDataProvider pointer
220 IMPORT_C void ParseSourceL(MMDXMLParserDataProvider *aSource);
222 /** Defines input stream character widths. */
223 enum TMDXMLParserInputCharWidth
225 EAscii = 0x01, //< ASCII
226 EUnicode = 0x02 //<Unicode
229 /** Sets the input stream character width.
231 * @param aWidth Character width for incoming stream. Possible values are EAscii and EUnicode (representing Ascii/UTF8 and Unicode respectively).
234 IMPORT_C void SetSourceCharacterWidth(TMDXMLParserInputCharWidth aWidth);
236 //Defect fix for INC036136- Enable the use of custom entity converters in GMXML
238 * Sets the entity converter to be used for parsing
239 * and takes ownership of the passed entity converter.
240 * @param aEntityConverter the entity converter to be used.
242 IMPORT_C void SetEntityConverter(CMDXMLEntityConverter* aEntityConverter);
243 //End Defect fix for INC036136
246 Controls whether invalid elements and attributes are added to the DOM.
247 @param aStoreInvalid ETrue if invalid content should be stored, EFalse otherwise.
249 IMPORT_C void SetStoreInvalid(TBool aStoreInvalid);
252 Controls whether whitespaces are handled by XML parser or by client.
253 @param aPreserve ETrue if all whitespaces should be preserved (handled by client), EFalse otherwise.
255 IMPORT_C void SetWhiteSpaceHandlingMode(TBool aPreserve);
257 public: // public functions used by other classes within the .dll, not for Export.
258 /** Gets the entity converter.
260 @return The entity converter */
261 CMDXMLEntityConverter* EntityConverter();
264 IMPORT_C virtual void DoCancel();
267 * RunL function inherited from CActive base class - carries out the actual parsing.
268 * @leave can Leave due to OOM
273 * Helper function that does the parsing - called from inside RunL
275 TBool DoParseLoopL();
278 * RunError function inherited from CActive base class - intercepts any Leave from
279 * the RunL() function, sets an appropriate errorcode and calls ParseFileCompleteL
281 IMPORT_C TInt RunError(TInt aError);
286 CMDXMLParser(MMDXMLParserObserver* aParserObserver);
288 CMDXMLParser(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
291 * Called when a character is read in and found to be outside of an element tag
293 virtual void HandleTextL(TDes& aChar);
297 KError = 0x00, // GetChar detected an error
298 KCharReturned, // GetChar returned a character
299 KWaitForChar // GetChar couldn't return a character this time, but might next time.
303 * Fetch one character from the input file
304 * @param aChar the returned character.
305 * @return returns one of the values of TGetCharReturn
307 TGetCharReturn GetChar(TDes& aChar);
309 /* utility functions, called from GetChar to deal with the
310 * 2 types of input stream
312 TGetCharReturn GetDoubleByteChar(TDes& aChar);
313 TGetCharReturn GetSingleByteChar(TDes& aChar);
316 * Fetch some more data from the data provider
317 * @return returns one of the values of TCharReturn
322 * @return Returns true if the current tag is a doctype tag and sets the
323 * Document DocType member accordingly on the first pass of this function.
328 * creates a new processing instruction if necessary and adds to document
329 * @return Returns true if the current tag is a processing instruction
331 TBool ProcessingInstructionL(CMDXMLElement* aParentElement);
334 * creates a new CDataSection if necessary and adds to document
335 * @return Returns true if the current tag is a CDataSection
337 TBool CDataSectionL(CMDXMLElement* aParentElement);
338 TBool EndOfCDataSection();
341 * @return returns true if the current tag is a version id tag and sets the
342 * Document Version member accordingly on the first pass of this function.
347 * creates a new comment if necessary and adds to document
348 * @return returns true if the current tag is a comment tag
350 TBool CommentL(CMDXMLElement* aParentElement);
353 * Parse a start of element tag and create an element with attributes set.
354 * @return Returns a pointer to the created element
355 * @leave can Leave due to OOM
357 virtual CMDXMLElement* ParseStartTagL();
360 * Detects the type of a file - can be Unicode or UTF-8
362 TBool DetectFileType();
365 * Creates a generic or DTD-specific document object
366 * @leave can Leave due to OOM
368 virtual void CreateDocumentL();
371 * Sets iError to new errorcode if more serious than any error so far encountered
373 IMPORT_C void SetError(const TInt aErrorCode, const TXMLErrorCodeSeverity aSeverity);
376 * This function is used to parse the attributes.
377 * @param aElement The element to which the attributes belong
378 * @param aTagToParse The tag to be parsed
379 * @return Returns KErrNone if both attribute name & value are valid
380 * KErrXMLBadAttributeName if the attribute name is invalid, or KErrXMLBadAttributeValue if the attribute value is invalid
381 * @leave can Leave due to OOM
383 TInt ParseElementAttributesL(CMDXMLElement& aElement, TDes& aTagToParse);
386 This function locates the next attribute in the tag.
387 @param aTagToParse the tag to find the attribute in
388 @return the offset of the next attribute
390 TInt LocateNextAttribute(const TDesC& aTagToParse);
393 * Parses an end tag. In fact, at this point the end tag must match
394 * the tag name of the start tag.
395 * @param aTagToParse Text of the end tag.
396 * @return Returns KErrNone if the end tag matches the start tag or KErrNotFound if there is a mismatch.
398 TInt ParseElementEndTag(CMDXMLElement& aElement, const TDesC& aTagToParse);
400 TInt CheckForStartCData(const TDesC& aTextToCheck);
401 TInt FindDelimiter(TDesC& aDataToSearch, TDesC& aDelimiterToFind);
404 * Second stage constructor
406 void ConstructL(MXMLDtd* aDtdRepresentation);
407 void AddTextL(CMDXMLElement* aParentElement);
410 * Checks whether the end of this tag is in a CDataSection.
411 * @param aDataToSearch The data to check
412 * @return Returns ETrue if the tag contains an unclosed CDataSection
414 TBool InCDataSection(TDesC& aDataToSearch);
417 * Entity converts the sections of one attribute value that are not within a CDataSection.
418 * @param aAttributeValue one attribute value
419 * @return Returns an error if entity conversion did not successfully complete, otherwise KErrNone
421 TInt ParseSingleAttributeL(TDes& aAttributeValue);
424 * Prepares this class for use on another file.
427 void PrepareForReuseL();
430 This should be called when parsing has been completed, before calling ParseFileCompleteL().
431 It checks for errors that can only be determined at the end of parsing, eg missing doctype or
434 void CheckForErrors();
436 IMPORT_C void PlaceholderForRemovedExport1(MMDXMLParserObserver* aParserObserver);
437 IMPORT_C void PlaceholderForRemovedExport2(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
438 IMPORT_C void PlaceholderForRemovedExport3();
442 enum TPanicCode { ENullMemVarDataSource,
443 ENullMemVarParserObserver,
445 ENullMemVarElementTag,
446 ENullParameterParentElement };
447 void Panic(TPanicCode aReason) const;
450 MMDXMLParserObserver* iParserObserver;
451 MXMLDtd* iDtdRepresentation;
452 TInt iError; // Current error
453 TXMLErrorCodeSeverity iSeverity; // ErrorCode severity
454 CMDXMLDocument* iXMLDoc; // Document created by the parser
455 CMDXMLEntityConverter* iEntityConverter; // Entity converter used by the parser
456 HBufC* iElementTag; // Currently processed element tag
461 /* member variables dealing with access to source data */
462 TPtrC8 iInputBufferPtr; // set during a call to get more data
463 TInt iCurrentInputBufferLen; // current length of the data block available
464 TInt iNextChar; // read position in the data block
465 TInt iInputBytesRemaining; // number of bytes remaining to read.
466 HBufC8 *iUTF8EdgeBuffer; // buffer to hold up to 6 bytes so that UTF8 parsing can span edges of data blocks
467 HBufC8 *iBomBuffer; // buffer to hold data at the start of the stream so we may determine charset
468 TInt iRequiredUTF8Bytes; // number of bytes required to complete the character held in the edge buffer
469 TBool iUnicodeInputMisaligned; // Set to ETrue if the unicode input stream is not aligned to 16-bit boundaries
470 MMDXMLParserDataProvider* iDataSource; // XML Data Source being parsed.
471 CMDXMLParserFileDataSource* iFileSource; // We own this, and need to free it when we are done. Only used when we're providing the data source object to wrap a local file.
473 /* member variables dealing with chunked conversion into unicode output */
474 TBuf<32> iUnicodeConversion; // buffer to temporarily hold the results of conversion from UTF8 to Unicode
475 TInt iUnicodeConversionLen; // number of characters stored in our intermediate buffer
476 TInt iUnicodeReadPos; // next character to send from our intermediate buffer
479 /* member variables used when parsing a local file */
482 RFile iFileHandleToParse;
486 /* member variables used in DoParseLoopL() */
489 CMDXMLElement* iNewElement;
490 CMDXMLElement* iParentElement;
502 EParserStates iState;
503 EParserStates iPreviousState;
504 TInt iSuspiciousCharacter;
505 TBool iStoreInvalid; // controls whether invalid elements and attributes are stored in the DOM.