1 // Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
2 // All rights reserved.
3 // This component and the accompanying materials are made available
4 // under the terms of the License "Symbian Foundation License v1.0" to Symbian Foundation members and "Symbian Foundation End User License Agreement v1.0" to non-members
5 // which accompanies this distribution, and is available
6 // at the URL "http://www.symbianfoundation.org/legal/licencesv10.html".
8 // Initial Contributors:
9 // Nokia Corporation - initial contribution.
14 // This file contains the declaration of the generic CMDXMLParser class
15 // which is responsible for creating a DOM structure
16 // from a given XML file.
26 #ifndef __GMXMLPARSER_H__
27 #define __GMXMLPARSER_H__
31 #include <gmxmlconstants.h>
35 class CMDXMLEntityConverter;
41 class MMDXMLParserObserver
42 /** Abstract observer interface for notification when XML parsing is complete.
44 It should be implemented by users of CMDXMLParser
50 Callback function used to inform a client of the parser when a parsing operation completes.
52 virtual void ParseFileCompleteL() = 0;
55 class MMDXMLParserDataProvider
56 /** Abstract data source interface for XML data source.
58 The user of CMDXMLParser must build one of these to encapsulate the data source
59 that they wish to parse. CMDXMLParser implements a file-based data source to
60 implement the functionality of the ParseFile function.
66 /** Status codes with which a GetData() implementation completes its request status. */
67 enum TDataProviderResults
69 KMoreData, ///< Returned by the interface implementation when it is returning more data.
70 KDataStreamError, ///< Returned by the interface when an unrecoverable error prevents obtaining more data. NOTE(review): the original text says a recoverable error should be represented by KDataNotReady, but no such enumerator is visible in this enum - confirm.
71 KDataStreamEnd ///< Returned by the interface when there is no more data to come.
76 The XML Parser calls this on a specific data provider to get more data
79 Note that the TPtrC supplied may be used by the parser at any time
80 between the return of this call and the next call that the parser
83 Your data provider must not move the data pointed to until the
84 parser has indicated that it's done with that block by asking for
87 Ownership of the data pointed to remains with the data provider.
90 General comments on efficiency
91 ------------------------------
93 The parser is designed such that it processes the whole data block
94 provided in one go. It will automatically become asynchronous when
95 another block is required - the data provider only needs to supply
98 Because of this design, it allows the data provider to indirectly
99 control the amount of processing time that will be needed
102 It is a good idea to balance the need for the fastest possible
103 processing with the need for client application responsiveness by
104 ensuring that the amount of data passed in a single block is not
105 too large. However, it is worth bearing in mind that the parser
106 will convert UTF8 data streams in blocks of 32 characters, and
107 supplying blocks of smaller length than this will result in a
108 slight loss of efficiency.
110 @param aPtr On return, the data provided
111 @param aStatus Asynchronous status to be completed by the function with a
112 TDataProviderResults value
114 virtual void GetData(TPtrC8 &aPtr, TRequestStatus &aStatus) = 0;
116 Called to indicate that use of the data source is complete.
118 virtual void Disconnect() = 0;
121 class CMDXMLParserFileDataSource;
123 class CMDXMLParser: public CActive
124 /** Creates a DOM structure from a given XML file.
126 The parsing operation is asynchronous and is initiated by a call to ParseFile().
127 On completion, the created DOM document can be retrieved through DetachXMLDoc().
129 Note the following ownership rules for the DOM document:
131 1. calling DetachXMLDoc() transfers ownership of the document to the client
133 2. if the parser is asked to parse a new file while it still owns an existing
134 DOM document, it will delete the old document.
141 /** Allocates and constructs a new XML parser. This overload takes no DTD representation.
143 @param aParserObserver XML parser observer
144 @leave KErrNoMemory Out of memory
145 @return New XML parser */
146 IMPORT_C static CMDXMLParser* NewL(MMDXMLParserObserver* aParserObserver);
148 /** Allocates and constructs a new XML parser, specifying a DTD representation.
150 @param aParserObserver XML parser observer
151 @param aDtdRepresentation DTD validator. NOTE(review): ownership of this object is not stated here - confirm.
152 @leave KErrNoMemory Out of memory
153 @return New XML parser */
154 IMPORT_C static CMDXMLParser* NewL(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
156 /** Allocates and constructs a new XML parser, leaving the object on the cleanup stack. This overload takes no DTD representation.
159 @param aParserObserver XML parser observer
160 @leave KErrNoMemory Out of memory
161 @return New XML parser */
162 IMPORT_C static CMDXMLParser* NewLC(MMDXMLParserObserver* aParserObserver);
164 /** Allocates and constructs a new XML parser, specifying a DTD representation and leaving the object on the cleanup stack.
167 @param aParserObserver XML parser observer
168 @param aDtdRepresentation DTD validator. NOTE(review): ownership of this object is not stated here - confirm.
169 @leave KErrNoMemory Out of memory
170 @return New XML parser */
171 IMPORT_C static CMDXMLParser* NewLC(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
175 IMPORT_C ~CMDXMLParser();
177 /** Gets the last error found by the parser.
181 IMPORT_C TInt Error() const;
184 Get the severity of the most severe error found.
185 @return the maximum error severity
187 IMPORT_C TXMLErrorCodeSeverity ErrorSeverity() const;
189 /** Detaches and returns the DOM document created by the parser.
191 This should be called after the conclusion of the parser process.
193 Note that the function sets the internal variable pointing to the document
194 to NULL, so this function can only be called once per file parse. The caller
195 takes ownership of the document, and must delete it when its use is complete.
197 @return The created DOM document; ownership passes to the caller */
198 IMPORT_C CMDXMLDocument* DetachXMLDoc();
200 /** Starts parsing a specified XML file into a DOM object tree. The parsing operation itself is asynchronous.
202 @param aRFs File server session (note: the handle is taken by value)
203 @param aFileToParse The name of the file to parse
204 @return KErrNone if successful, otherwise a file read error code */
205 IMPORT_C TInt ParseFile(RFs aRFs, const TDesC& aFileToParse);
/** Overload of ParseFile() that takes a file handle instead of a file name.
@param aFileHandleToParse Handle of the file to parse. NOTE(review): presumably the
handle must already be open, and whether the parser closes it is not stated here - confirm.
@return NOTE(review): presumably KErrNone on success or a file read error code, as for
the file-name overload - confirm against the implementation. */
207 IMPORT_C TInt ParseFile(RFile& aFileHandleToParse);
209 /** Parses a specified XML Data Source into a DOM object tree.
210 Use ParseSourceL() function in preference to ParseSource()
211 @param aSource MMDXMLParserDataProvider pointer
213 inline void ParseSource(MMDXMLParserDataProvider *aSource)
215 TRAP_IGNORE(ParseSourceL(aSource));
218 /** Parses a specified XML Data Source into a DOM object tree.
219 @param aSource MMDXMLParserDataProvider pointer
221 IMPORT_C void ParseSourceL(MMDXMLParserDataProvider *aSource);
223 /** Defines the input stream character widths accepted by SetSourceCharacterWidth(). */
224 enum TMDXMLParserInputCharWidth
226 EAscii = 0x01, ///< ASCII / UTF8 input
227 EUnicode = 0x02 ///< Unicode input
230 /** Sets the input stream character width.
232 * @param aWidth Character width for incoming stream. Possible values are EAscii and EUnicode (representing Ascii/UTF8 and Unicode respectively).
235 IMPORT_C void SetSourceCharacterWidth(TMDXMLParserInputCharWidth aWidth);
237 //Defect fix for INC036136- Enable the use of custom entity converters in GMXML
239 * Sets the entity converter to be used for parsing
240 * and takes ownership of the passed entity converter.
241 * @param aEntityConverter the entity converter to be used.
243 IMPORT_C void SetEntityConverter(CMDXMLEntityConverter* aEntityConverter);
244 //End Defect fix for INC036136
247 Controls whether invalid elements and attributes are added to the DOM.
248 @param aStoreInvalid ETrue if invalid content should be stored, EFalse otherwise.
250 IMPORT_C void SetStoreInvalid(TBool aStoreInvalid);
253 Controls whether whitespaces are handled by XML parser or by client.
254 @param aPreserve ETrue if all whitespaces should be preserved (handled by client), EFalse otherwise.
256 IMPORT_C void SetWhiteSpaceHandlingMode(TBool aPreserve);
258 public: // public functions used by other classes within the .dll, not for Export.
259 /** Gets the entity converter. Not exported: for use by other classes within the DLL.
261 @return The entity converter. NOTE(review): presumably the parser retains ownership - confirm. */
262 CMDXMLEntityConverter* EntityConverter();
265 IMPORT_C virtual void DoCancel();
268 * RunL function inherited from CActive base class - carries out the actual parsing.
269 * @leave can Leave due to OOM
274 * Helper function that does the parsing - called from inside RunL
276 TBool DoParseLoopL();
279 * RunError function inherited from CActive base class - intercepts any Leave from
280 * the RunL() function, sets an appropriate errorcode and calls ParseFileCompleteL
282 IMPORT_C TInt RunError(TInt aError);
287 CMDXMLParser(MMDXMLParserObserver* aParserObserver);
289 CMDXMLParser(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
292 * Called when a character is read in and found to be outside of an element tag
294 virtual void HandleTextL(TDes& aChar);
298 KError = 0x00, // GetChar detected an error
299 KCharReturned, // GetChar returned a character
300 KWaitForChar // GetChar couldn't return a character this time, but might next time.
304 * Fetch one character from the input file
305 * @param aChar the returned character.
306 * @return returns one of the values of TGetCharReturn
308 TGetCharReturn GetChar(TDes& aChar);
310 /* utility functions, called from GetChar to deal with the
311 * 2 types of input stream
313 TGetCharReturn GetDoubleByteChar(TDes& aChar);
314 TGetCharReturn GetSingleByteChar(TDes& aChar);
317 * Fetch some more data from the data provider
318 * @return returns one of the values of TCharReturn
323 * @return Returns true if the current tag is a doctype tag and sets the
324 * Document DocType member accordingly on the first pass of this function.
329 * creates a new processing instruction if necessary and adds to document
330 * @return Returns true if the current tag is a processing instruction
332 TBool ProcessingInstructionL(CMDXMLElement* aParentElement);
335 * creates a new CDataSection if necessary and adds to document
336 * @return Returns true if the current tag is a CDATA section
338 TBool CDataSectionL(CMDXMLElement* aParentElement);
339 TBool EndOfCDataSection();
342 * @return returns true if the current tag is a version id tag and sets the
343 * Document Version member accordingly on the first pass of this function.
348 * creates a new comment if necessary and adds to document
349 * @return returns true if the current tag is a comment tag
351 TBool CommentL(CMDXMLElement* aParentElement);
354 * Parse a start of element tag and create an element with attributes set.
355 * @return Returns a pointer to the created element
356 * @leave can Leave due to OOM
358 virtual CMDXMLElement* ParseStartTagL();
361 * Detects the type of a file - can be Unicode or UTF-8
363 TBool DetectFileType();
366 * Creates a generic or DTD-specific document object
367 * @leave can Leave due to OOM
369 virtual void CreateDocumentL();
372 * Sets iError to new errorcode if more serious than any error so far encountered
374 IMPORT_C void SetError(const TInt aErrorCode, const TXMLErrorCodeSeverity aSeverity);
377 * This function is used to parse the attributes.
378 * @param aElement The element to which the attributes belong
379 * @param aTagToParse The tag to be parsed
380 * @return Returns KErrNone if both the attribute name and value are valid,
381 * KErrXMLBadAttributeName if the attribute name is invalid, or KErrXMLBadAttributeValue if the attribute value is invalid
382 * @leave can Leave due to OOM
384 TInt ParseElementAttributesL(CMDXMLElement& aElement, TDes& aTagToParse);
387 This function locates the next attribute in the tag.
388 @param aTagToParse the tag to find the attribute in
389 @return the offset of the next attribute
391 TInt LocateNextAttribute(const TDesC& aTagToParse);
394 * Parses an end tag. In fact, at this point the end tag must match
395 * the tag name of the start tag.
396 * @param aTagToParse Text of the end tag.
397 * @return Returns KErrNone if the end tag matches the start tag or KErrNotFound if there is a mismatch.
399 TInt ParseElementEndTag(CMDXMLElement& aElement, const TDesC& aTagToParse);
401 TInt CheckForStartCData(const TDesC& aTextToCheck);
402 TInt FindDelimiter(TDesC& aDataToSearch, TDesC& aDelimiterToFind);
405 * Second stage constructor
407 void ConstructL(MXMLDtd* aDtdRepresentation);
408 void AddTextL(CMDXMLElement* aParentElement);
411 * Checks whether the end of this tag is in a CDataSection.
412 * @param aDataToSearch The data to check
413 * @return Returns ETrue if the tag contains an unclosed CDataSection
415 TBool InCDataSection(TDesC& aDataToSearch);
418 * Entity converts the sections of one attribute value that are not within a CDataSection.
419 * @param aAttributeValue one attribute value
420 * @return Returns an error if entity conversion did not successfully complete, otherwise KErrNone
422 TInt ParseSingleAttributeL(TDes& aAttributeValue);
425 * Prepares this class for use on another file.
428 void PrepareForReuseL();
431 This should be called when parsing has been completed, before calling ParseFileCompleteL().
432 It checks for errors that can only be determined at the end of parsing, eg missing doctype or
435 void CheckForErrors();
437 IMPORT_C void PlaceholderForRemovedExport1(MMDXMLParserObserver* aParserObserver);
438 IMPORT_C void PlaceholderForRemovedExport2(MMDXMLParserObserver* aParserObserver, MXMLDtd* aDtdRepresentation);
439 IMPORT_C void PlaceholderForRemovedExport3();
443 enum TPanicCode { ENullMemVarDataSource,
444 ENullMemVarParserObserver,
446 ENullMemVarElementTag,
447 ENullParameterParentElement };
448 void Panic(TPanicCode aReason) const;
451 MMDXMLParserObserver* iParserObserver;
452 MXMLDtd* iDtdRepresentation;
453 TInt iError; // Current error
454 TXMLErrorCodeSeverity iSeverity; // ErrorCode severity
455 CMDXMLDocument* iXMLDoc; // Document created by the parser
456 CMDXMLEntityConverter* iEntityConverter; // Entity converter used by the parser
457 HBufC* iElementTag; // Currently processed element tag
462 /* member variables dealing with access to source data */
463 TPtrC8 iInputBufferPtr; // set during a call to get more data
464 TInt iCurrentInputBufferLen; // current length of the data block available
465 TInt iNextChar; // read position in the data block
466 TInt iInputBytesRemaining; // number of bytes remaining to read.
467 HBufC8 *iUTF8EdgeBuffer; // buffer to hold up to 6 bytes so that UTF8 parsing can span edges of data blocks
468 HBufC8 *iBomBuffer; // buffer to hold data at the start of the stream so we may determine charset
469 TInt iRequiredUTF8Bytes; // number of bytes required to complete the character held in the edge buffer
470 TBool iUnicodeInputMisaligned; // Set to ETrue if the unicode input stream is not aligned to 16-bit boundaries
471 MMDXMLParserDataProvider* iDataSource; // XML Data Source being parsed.
472 CMDXMLParserFileDataSource* iFileSource; // We own this, and need to free it when we are done. Only used when we're providing the data source object to wrap a local file.
474 /* member variables dealing with chunked conversion into unicode output */
475 TBuf<32> iUnicodeConversion; // buffer to temporarily hold the results of conversion from UTF8 to Unicode
476 TInt iUnicodeConversionLen; // number of characters stored in our intermediate buffer
477 TInt iUnicodeReadPos; // next character to send from our intermediate buffer
480 /* member variables used when parsing a local file */
483 RFile iFileHandleToParse;
487 /* member variables used in DoParseLoopL() */
490 CMDXMLElement* iNewElement;
491 CMDXMLElement* iParentElement;
503 EParserStates iState;
504 EParserStates iPreviousState;
505 TInt iSuspiciousCharacter;
506 TBool iStoreInvalid; // controls whether invalid elements and attributes are stored in the DOM.