os/persistentdata/persistentstorage/sql/SQLite/pager.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ** 2001 September 15
     3 **
     4 ** The author disclaims copyright to this source code.  In place of
     5 ** a legal notice, here is a blessing:
     6 **
     7 **    May you do good and not evil.
     8 **    May you find forgiveness for yourself and forgive others.
     9 **    May you share freely, never taking more than you give.
    10 **
    11 *************************************************************************
    12 ** This is the implementation of the page cache subsystem or "pager".
    13 ** 
    14 ** The pager is used to access a database disk file.  It implements
    15 ** atomic commit and rollback through the use of a journal file that
    16 ** is separate from the database file.  The pager also implements file
    17 ** locking to prevent two processes from writing the same database
    18 ** file simultaneously, or one process from reading the database while
    19 ** another is writing.
    20 **
    21 ** @(#) $Id: pager.c,v 1.469 2008/08/02 03:50:39 drh Exp $
    22 */
    23 #ifndef SQLITE_OMIT_DISKIO
    24 #include "sqliteInt.h"
    25 #include <assert.h>
    26 #include <string.h>
    27 
/*
** Macros for troubleshooting.  Normally turned off.
**
** Change the "#if 0" below to "#if 1" to route the PAGERTRACEn() macros
** to printf() by way of sqlite3DebugPrintf.  The digit in each macro name
** is the total number of arguments it takes: a printf-style format
** string X followed by n-1 values.
**
** With tracing disabled every PAGERTRACEn() expands to nothing, so its
** arguments are never evaluated.
*/
#if 0
#define sqlite3DebugPrintf printf
#define PAGERTRACE1(X)       sqlite3DebugPrintf(X)
#define PAGERTRACE2(X,Y)     sqlite3DebugPrintf(X,Y)
#define PAGERTRACE3(X,Y,Z)   sqlite3DebugPrintf(X,Y,Z)
#define PAGERTRACE4(X,Y,Z,W) sqlite3DebugPrintf(X,Y,Z,W)
#define PAGERTRACE5(X,Y,Z,W,V) sqlite3DebugPrintf(X,Y,Z,W,V)
#else
#define PAGERTRACE1(X)
#define PAGERTRACE2(X,Y)
#define PAGERTRACE3(X,Y,Z)
#define PAGERTRACE4(X,Y,Z,W)
#define PAGERTRACE5(X,Y,Z,W,V)
#endif
    45 
    46 /*
    47 ** The following two macros are used within the PAGERTRACEX() macros above
    48 ** to print out file-descriptors. 
    49 **
    50 ** PAGERID() takes a pointer to a Pager struct as its argument. The
    51 ** associated file-descriptor is returned. FILEHANDLEID() takes an sqlite3_file
    52 ** struct as its argument.
    53 */
    54 #define PAGERID(p) ((int)(p->fd))
    55 #define FILEHANDLEID(fd) ((int)fd)
    56 
    57 /*
    58 ** The page cache as a whole is always in one of the following
    59 ** states:
    60 **
    61 **   PAGER_UNLOCK        The page cache is not currently reading or 
    62 **                       writing the database file.  There is no
    63 **                       data held in memory.  This is the initial
    64 **                       state.
    65 **
    66 **   PAGER_SHARED        The page cache is reading the database.
    67 **                       Writing is not permitted.  There can be
    68 **                       multiple readers accessing the same database
    69 **                       file at the same time.
    70 **
    71 **   PAGER_RESERVED      This process has reserved the database for writing
    72 **                       but has not yet made any changes.  Only one process
    73 **                       at a time can reserve the database.  The original
    74 **                       database file has not been modified so other
    75 **                       processes may still be reading the on-disk
    76 **                       database file.
    77 **
    78 **   PAGER_EXCLUSIVE     The page cache is writing the database.
    79 **                       Access is exclusive.  No other processes or
    80 **                       threads can be reading or writing while one
    81 **                       process is writing.
    82 **
    83 **   PAGER_SYNCED        The pager moves to this state from PAGER_EXCLUSIVE
    84 **                       after all dirty pages have been written to the
    85 **                       database file and the file has been synced to
    86 **                       disk. All that remains to do is to remove or
    87 **                       truncate the journal file and the transaction 
    88 **                       will be committed.
    89 **
    90 ** The page cache comes up in PAGER_UNLOCK.  The first time a
    91 ** sqlite3PagerGet() occurs, the state transitions to PAGER_SHARED.
    92 ** After all pages have been released using sqlite_page_unref(),
    93 ** the state transitions back to PAGER_UNLOCK.  The first time
    94 ** that sqlite3PagerWrite() is called, the state transitions to
    95 ** PAGER_RESERVED.  (Note that sqlite3PagerWrite() can only be
    96 ** called on an outstanding page which means that the pager must
    97 ** be in PAGER_SHARED before it transitions to PAGER_RESERVED.)
    98 ** PAGER_RESERVED means that there is an open rollback journal.
    99 ** The transition to PAGER_EXCLUSIVE occurs before any changes
   100 ** are made to the database file, though writes to the rollback
   101 ** journal occurs with just PAGER_RESERVED.  After an sqlite3PagerRollback()
   102 ** or sqlite3PagerCommitPhaseTwo(), the state can go back to PAGER_SHARED,
   103 ** or it can stay at PAGER_EXCLUSIVE if we are in exclusive access mode.
   104 */
/* Pager states.  The value 3 is not used by any state defined here. */
#define PAGER_UNLOCK      0   /* initial state: not reading or writing */
#define PAGER_SHARED      1   /* same as SHARED_LOCK */
#define PAGER_RESERVED    2   /* same as RESERVED_LOCK */
#define PAGER_EXCLUSIVE   4   /* same as EXCLUSIVE_LOCK */
#define PAGER_SYNCED      5   /* db written and synced; journal not yet finalized */
   110 
   111 /*
   112 ** If the SQLITE_BUSY_RESERVED_LOCK macro is set to true at compile-time,
   113 ** then failed attempts to get a reserved lock will invoke the busy callback.
   114 ** This is off by default.  To see why, consider the following scenario:
   115 ** 
   116 ** Suppose thread A already has a shared lock and wants a reserved lock.
   117 ** Thread B already has a reserved lock and wants an exclusive lock.  If
   118 ** both threads are using their busy callbacks, it might be a long time
   119 ** before one of the threads gives up and allows the other to proceed.
   120 ** But if the thread trying to get the reserved lock gives up quickly
   121 ** (if it never invokes its busy callback) then the contention will be
   122 ** resolved quickly.
   123 */
#ifndef SQLITE_BUSY_RESERVED_LOCK
  /* Off unless the build predefines the macro to a true value. */
# define SQLITE_BUSY_RESERVED_LOCK 0
#endif
   127 
/*
** This macro rounds values up so that if the value is an address it
** is guaranteed to be an address that is aligned to an 8-byte boundary.
** A value that is already a multiple of 8 is left unchanged.  The
** argument X appears once in the expansion.
*/
#define FORCE_ALIGNMENT(X)   (((X)+7)&~7)

/* Forward declaration so that PgHdr pointers can be used before the
** structure itself is defined. */
typedef struct PgHdr PgHdr;
   135 
   136 /*
   137 ** Each pager stores all currently unreferenced pages in a list sorted
   138 ** in least-recently-used (LRU) order (i.e. the first item on the list has 
   139 ** not been referenced in a long time, the last item has been recently
   140 ** used). An instance of this structure is included as part of each
   141 ** pager structure for this purpose (variable Pager.lru).
   142 **
   143 ** Additionally, if memory-management is enabled, all unreferenced pages 
   144 ** are stored in a global LRU list (global variable sqlite3LruPageList).
   145 **
   146 ** In both cases, the PagerLruList.pFirstSynced variable points to
   147 ** the first page in the corresponding list that does not require an
   148 ** fsync() operation before its memory can be reclaimed. If no such
   149 ** page exists, PagerLruList.pFirstSynced is set to NULL.
   150 */
typedef struct PagerLruList PagerLruList;
struct PagerLruList {
  PgHdr *pFirst;         /* First page in LRU list (least recently used) */
  PgHdr *pLast;          /* Last page in LRU list (the most recently used) */
  PgHdr *pFirstSynced;   /* First page in list with PgHdr.needSync==0, or NULL */
};
   157 
   158 /*
   159 ** The following structure contains the next and previous pointers used
   160 ** to link a PgHdr structure into a PagerLruList linked list. 
   161 */
typedef struct PagerLruLink PagerLruLink;
struct PagerLruLink {
  PgHdr *pNext;          /* Next page in the list; 0 for the last entry */
  PgHdr *pPrev;          /* Previous page in the list; 0 for the first entry */
};
   167 
   168 /*
   169 ** Each in-memory image of a page begins with the following header.
   170 ** This header is only visible to this pager module.  The client
   171 ** code that calls pager sees only the data that follows the header.
   172 **
   173 ** Client code should call sqlite3PagerWrite() on a page prior to making
   174 ** any modifications to that page.  The first time sqlite3PagerWrite()
   175 ** is called, the original page contents are written into the rollback
   176 ** journal and PgHdr.inJournal and PgHdr.needSync are set.  Later, once
   177 ** the journal page has made it onto the disk surface, PgHdr.needSync
   178 ** is cleared.  The modified page cannot be written back into the original
   179 ** database file until the journal pages have been synced to disk and the
   180 ** PgHdr.needSync has been cleared.
   181 **
   182 ** The PgHdr.dirty flag is set when sqlite3PagerWrite() is called and
   183 ** is cleared again when the page content is written back to the original
   184 ** database file.
   185 **
   186 ** Details of important structure elements:
   187 **
   188 ** needSync
   189 **
   190 **     If this is true, this means that it is not safe to write the page
   191 **     content to the database because the original content needed
   192 **     for rollback has not been synced to the main rollback journal.
   193 **     The original content may have been written to the rollback journal
   194 **     but it has not yet been synced.  So we cannot write to the database
   195 **     file because power failure might cause the page in the journal file
   196 **     to never reach the disk.  It is as if the write to the journal file
   197 **     does not occur until the journal file is synced.
   198 **     
   199 **     This flag is false if the page content exactly matches what
   200 **     currently exists in the database file.  The needSync flag is also
   201 **     false if the original content has been written to the main rollback
   202 **     journal and synced.  If the page represents a new page that has
   203 **     been added onto the end of the database during the current
   204 **     transaction, the needSync flag is true until the original database
   205 **     size in the journal header has been synced to disk.
   206 **
   207 ** inJournal
   208 **
   209 **     This is true if the original page has been written into the main
   210 **     rollback journal.  This is always false for new pages added to
   211 **     the end of the database file during the current transaction.
   212 **     And this flag says nothing about whether or not the journal
   213 **     has been synced to disk.  For pages that are in the original
   214 **     database file, the following expression should always be true:
   215 **
   216 **       inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno)
   217 **
   218 **     The pPager->pInJournal object is only valid for the original
   219 **     pages of the database, not new pages that are added to the end
   220 **     of the database, so obviously the above expression cannot be
   221 **     valid for new pages.  For new pages inJournal is always 0.
   222 **
   223 ** dirty
   224 **
   225 **     When true, this means that the content of the page has been
   226 **     modified and needs to be written back to the database file.
   227 **     If false, it means that either the content of the page is
   228 **     unchanged or else the content is unimportant and we do not
   229 **     care whether or not it is preserved.
   230 **
   231 ** alwaysRollback
   232 **
   233 **     This means that the sqlite3PagerDontRollback() API should be
   234 **     ignored for this page.  The DontRollback() API attempts to say
   235 **     that the content of the page on disk is unimportant (it is an
   236 **     unused page on the freelist) so that it is unnecessary to 
   237 **     rollback changes to this page because the content of the page
   238 **     can change without changing the meaning of the database.  This
   239 **     flag overrides any DontRollback() attempt.  This flag is set
   240 **     when a page that originally contained valid data is added to
   241 **     the freelist.  Later in the same transaction, this page might
   242 **     be pulled from the freelist and reused for something different
   243 **     and at that point the DontRollback() API will be called because
   244 **     pages taken from the freelist do not need to be protected by
   245 **     the rollback journal.  But this flag says that the page was
   246 **     not originally part of the freelist so that it still needs to
   247 **     be rolled back in spite of any subsequent DontRollback() calls.
   248 **
   249 ** needRead 
   250 **
   251 **     This flag means (when true) that the content of the page has
   252 **     not yet been loaded from disk.  The in-memory content is just
   253 **     garbage.  (Actually, we zero the content, but you should not
   254 **     make any assumptions about the content nevertheless.)  If the
   255 **     content is needed in the future, it should be read from the
   256 **     original database file.
   257 */
struct PgHdr {
  Pager *pPager;                 /* The pager to which this page belongs */
  Pgno pgno;                     /* The page number for this page */
  PgHdr *pNextHash, *pPrevHash;  /* Hash collision chain for PgHdr.pgno */
  PagerLruLink free;             /* Links for the per-pager LRU list (Pager.lru) */
  PgHdr *pNextAll;               /* A list of all pages */
  u8 inJournal;                  /* TRUE if has been written to journal */
  u8 dirty;                      /* TRUE if we need to write back changes */
  u8 needSync;                   /* Sync journal before writing this page */
  u8 alwaysRollback;             /* Disable DontRollback() for this page */
  u8 needRead;                   /* Read content if PagerWrite() is called */
  short int nRef;                /* Number of users of this page */
  PgHdr *pDirty, *pPrevDirty;    /* Next and previous entries in the dirty list */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  PgHdr *pPrevAll;               /* A list of all pages */
  PagerLruLink gfree;            /* Links for the global list of nRef==0 pages */
#endif
#ifdef SQLITE_CHECK_PAGES
  u32 pageHash;                  /* NOTE(review): presumably a hash of the page
                                 ** content used for consistency checks - confirm */
#endif
  void *pData;                   /* Page data (what PGHDR_TO_DATA() yields) */
  /* Pager.nExtra bytes of local data appended to this header */
};
   281 
   282 /*
   283 ** For an in-memory only database, some extra information is recorded about
   284 ** each page so that changes can be rolled back.  (Journal files are not
   285 ** used for in-memory databases.)  The following information is added to
   286 ** the end of every EXTRA block for in-memory databases.
   287 **
   288 ** This information could have been added directly to the PgHdr structure.
   289 ** But then it would take up an extra 8 bytes of storage on every PgHdr
   290 ** even for disk-based databases.  Splitting it out saves 8 bytes.  This
   291 ** is only a savings of 0.8% but those percentages add up.
   292 */
/* Located in memory by the PGHDR_TO_HIST() macro. */
typedef struct PgHistory PgHistory;
struct PgHistory {
  u8 *pOrig;     /* Original page text.  Restore to this on a full rollback */
  u8 *pStmt;     /* Text as it was at the beginning of the current statement */
  PgHdr *pNextStmt, *pPrevStmt;  /* List of pages in the statement journal */
  u8 inStmt;                     /* TRUE if in the statement subjournal; this
                                 ** is what pageInStatement() reports for an
                                 ** in-memory database */
};
   300 
   301 /*
   302 ** A macro used for invoking the codec if there is one
   303 */
   304 #ifdef SQLITE_HAS_CODEC
   305 # define CODEC1(P,D,N,X) if( P->xCodec!=0 ){ P->xCodec(P->pCodecArg,D,N,X); }
   306 # define CODEC2(P,D,N,X) ((char*)(P->xCodec!=0?P->xCodec(P->pCodecArg,D,N,X):D))
   307 #else
   308 # define CODEC1(P,D,N,X) /* NO-OP */
   309 # define CODEC2(P,D,N,X) ((char*)D)
   310 #endif
   311 
/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
**
**   PGHDR_TO_DATA(P)      The pData member of page header P.
**   PGHDR_TO_EXTRA(G,P)   The address immediately after page header G, as
**                         a void*.  The second argument is not used by
**                         the expansion.
**   PGHDR_TO_HIST(P,PGR)  The PgHistory that sits (PGR)->nExtra bytes past
**                         the end of page header P.
*/
#define PGHDR_TO_DATA(P)    ((P)->pData)
#define PGHDR_TO_EXTRA(G,P) ((void*)&((G)[1]))
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)&((char*)(&(P)[1]))[(PGR)->nExtra])
   320 
   321 /*
   322 ** An open page cache is an instance of the following structure.
   323 **
   324 ** Pager.errCode may be set to SQLITE_IOERR, SQLITE_CORRUPT,
   325 ** or SQLITE_FULL. Once one of the first two errors occurs, it persists
   326 ** and is returned as the result of every major pager API call.  The
   327 ** SQLITE_FULL return code is slightly different. It persists only until the
   328 ** next successful rollback is performed on the pager cache. Also,
   329 ** SQLITE_FULL does not affect the sqlite3PagerGet() and sqlite3PagerLookup()
   330 ** APIs, they may still be used successfully.
   331 */
struct Pager {
  sqlite3_vfs *pVfs;          /* OS functions to use for IO */
  u8 journalOpen;             /* True if the journal file descriptor is valid */
  u8 journalStarted;          /* True if header of journal is synced */
  u8 useJournal;              /* Use a rollback journal on this file */
  u8 noReadlock;              /* Do not bother to obtain readlocks */
  u8 stmtOpen;                /* True if the statement subjournal is open */
  u8 stmtInUse;               /* True if we are in a statement subtransaction */
  u8 stmtAutoopen;            /* Open stmt journal when main journal is opened */
  u8 noSync;                  /* Do not sync the journal if true */
  u8 fullSync;                /* Do extra syncs of the journal for robustness */
  u8 sync_flags;              /* One of SYNC_NORMAL or SYNC_FULL */
  u8 state;                   /* PAGER_UNLOCK, _SHARED, _RESERVED, etc. */
  u8 tempFile;                /* zFilename is a temporary file */
  u8 readOnly;                /* True for a read-only database */
  u8 needSync;                /* True if an fsync() is needed on the journal */
  u8 dirtyCache;              /* True if cached pages have changed */
  u8 alwaysRollback;          /* Disable DontRollback() for all pages */
  u8 memDb;                   /* True to inhibit all file I/O (see MEMDB) */
  u8 setMaster;               /* True if a m-j name has been written to jrnl */
  u8 doNotSync;               /* Boolean. While true, do not spill the cache */
  u8 exclusiveMode;           /* Boolean. True if locking_mode==EXCLUSIVE */
  u8 journalMode;             /* One of the PAGER_JOURNALMODE_* values */
  u8 dbModified;              /* True if there are any changes to the Db */
  u8 changeCountDone;         /* Set after incrementing the change-counter */
  u32 vfsFlags;               /* Flags for sqlite3_vfs.xOpen() */
  int errCode;                /* One of several kinds of errors */
  int dbSize;                 /* Number of pages in the file */
  int origDbSize;             /* dbSize before the current change */
  int stmtSize;               /* Size of database (in pages) at stmt_begin() */
  int nRec;                   /* Number of pages written to the journal */
  u32 cksumInit;              /* Quasi-random value added to every checksum */
  int stmtNRec;               /* Number of records in stmt subjournal */
  int nExtra;                 /* Add this many bytes to each in-memory page */
  int pageSize;               /* Number of bytes in a page */
  int nPage;                  /* Total number of in-memory pages */
  int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
  int mxPage;                 /* Maximum number of pages to hold in cache */
  Pgno mxPgno;                /* Maximum allowed size of the database */
  Bitvec *pInJournal;         /* One bit for each page in the database file */
  Bitvec *pInStmt;            /* One bit for each page in the database */
  char *zFilename;            /* Name of the database file */
  char *zJournal;             /* Name of the journal file */
  char *zDirectory;           /* Directory holding database and journal files */
  sqlite3_file *fd, *jfd;     /* File descriptors for database and journal */
  sqlite3_file *stfd;         /* File descriptor for the statement subjournal */
  BusyHandler *pBusyHandler;  /* Pointer to sqlite.busyHandler */
  PagerLruList lru;           /* LRU list of free pages */
  PgHdr *pAll;                /* List of all pages */
  PgHdr *pStmt;               /* List of pages in the statement subjournal */
  PgHdr *pDirty;              /* List of all dirty pages */
  i64 journalOff;             /* Current byte offset in the journal file */
  i64 journalHdr;             /* Byte offset to previous journal header */
  i64 stmtHdrOff;             /* First journal header written this statement */
  i64 stmtCksum;              /* cksumInit when statement was started */
  i64 stmtJSize;              /* Size of journal at stmt_begin() */
  int sectorSize;             /* Assumed sector size during rollback; also
                              ** the journal header size (JOURNAL_HDR_SZ) */
#ifdef SQLITE_TEST
  int nHit, nMiss;            /* Cache hits and misses */
  int nRead, nWrite;          /* Database pages read/written */
#endif
  void (*xDestructor)(DbPage*,int); /* Call this routine when freeing pages */
  void (*xReiniter)(DbPage*,int);   /* Call this routine when reloading pages */
#ifdef SQLITE_HAS_CODEC
  void *(*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
  void *pCodecArg;            /* First argument to xCodec() */
#endif
  int nHash;                  /* Size of the pager hash table */
  PgHdr **aHash;              /* Hash table to map page number to PgHdr */
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  Pager *pNext;               /* Doubly linked list of pagers on which */
  Pager *pPrev;               /* sqlite3_release_memory() will work */
  /* NOTE(review): the original one-line comments on the next two fields
  ** ("unavailable to MM" on iInUseMM, "in sqlite3_release_memory()" on
  ** iInUseDB) look swapped relative to how pagerEnter() uses them -
  ** confirm against the code that sets iInUseMM. */
  volatile int iInUseMM;      /* While non-zero, an outermost pagerEnter()
                              ** first waits on the STATIC_MEM2 mutex */
  volatile int iInUseDB;      /* Nesting count kept by pagerEnter() and
                              ** pagerLeave() */
#endif
  char *pTmpSpace;            /* Pager.pageSize bytes of space for tmp use */
  char dbFileVers[16];        /* Changes whenever database file changes */
  i64 journalSizeLimit;       /* Size limit for persistent journal files */
};
   411 
/*
** The following global variables hold counters used for
** testing purposes only.  These variables do not exist in
** a non-testing build.  These variables are not thread-safe.
**
** PAGER_INCR(v) increments v when SQLITE_TEST is defined and expands to
** nothing otherwise, so its argument is not evaluated in a non-testing
** build.
*/
#ifdef SQLITE_TEST
int sqlite3_pager_readdb_count = 0;    /* Number of full pages read from DB */
int sqlite3_pager_writedb_count = 0;   /* Number of full pages written to DB */
int sqlite3_pager_writej_count = 0;    /* Number of pages written to journal */
int sqlite3_pager_pgfree_count = 0;    /* Number of cache pages freed */
# define PAGER_INCR(v)  v++
#else
# define PAGER_INCR(v)
#endif
   426 
/*
** The following variable points to the head of a double-linked list
** of all pagers that are eligible for page stealing by the
** sqlite3_release_memory() interface.  Access to this list is
** protected by the SQLITE_MUTEX_STATIC_MEM2 mutex.
**
** sqlite3LruPageList is the global LRU list of unreferenced pages.  It
** starts out empty.  lruListAdd() and lruListRemove() modify it while
** holding the SQLITE_MUTEX_STATIC_LRU mutex.
**
** Both variables exist only when SQLITE_ENABLE_MEMORY_MANAGEMENT is
** defined.
*/
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
static Pager *sqlite3PagerList = 0;
static PagerLruList sqlite3LruPageList = {0, 0, 0};
#endif
   437 
   438 
   439 /*
   440 ** Journal files begin with the following magic string.  The data
   441 ** was obtained from /dev/random.  It is used only as a sanity check.
   442 **
   443 ** Since version 2.8.0, the journal format contains additional sanity
   444 ** checking information.  If the power fails while the journal is being
   445 ** written, semi-random garbage data might appear in the journal
   446 ** file after power is restored.  If an attempt is then made
   447 ** to roll the journal back, the database could be corrupted.  The additional
   448 ** sanity checking data is an attempt to discover the garbage in the
   449 ** journal and ignore it.
   450 **
   451 ** The sanity checking information for the new journal format consists
   452 ** of a 32-bit checksum on each page of data.  The checksum covers both
   453 ** the page number and the pPager->pageSize bytes of data for the page.
   454 ** This cksum is initialized to a 32-bit random value that appears in the
   455 ** journal file right after the header.  The random initializer is important,
   456 ** because garbage data that appears at the end of a journal is likely
   457 ** data that was once in other files that have now been deleted.  If the
   458 ** garbage data came from an obsolete journal file, the checksums might
   459 ** be correct.  But by initializing the checksum to random value which
   460 ** is different for every journal, we minimize that risk.
   461 */
/* Eight-byte magic string found at the start of a journal file. */
static const unsigned char aJournalMagic[] = {
  0xd9, 0xd5, 0x05, 0xf9, 0x20, 0xa1, 0x63, 0xd7,
};
   465 
   466 /*
   467 ** The size of the header and of each page in the journal is determined
   468 ** by the following macros.
   469 */
   470 #define JOURNAL_PG_SZ(pPager)  ((pPager->pageSize) + 8)
   471 
   472 /*
   473 ** The journal header size for this pager. In the future, this could be
   474 ** set to some value read from the disk controller. The important
   475 ** characteristic is that it is the same size as a disk sector.
   476 */
   477 #define JOURNAL_HDR_SZ(pPager) (pPager->sectorSize)
   478 
/*
** The macro MEMDB is true if we are dealing with an in-memory database.
** We do this as a macro so that if the SQLITE_OMIT_MEMORYDB macro is set,
** the value of MEMDB will be a constant and the compiler will optimize
** out code that would never execute.
**
** When SQLITE_OMIT_MEMORYDB is not set, the expansion refers to a
** variable named pPager, so MEMDB can only be used where a Pager pointer
** with exactly that name is in scope.
*/
#ifdef SQLITE_OMIT_MEMORYDB
# define MEMDB 0
#else
# define MEMDB pPager->memDb
#endif
   490 
/*
** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
** reserved for working around a windows/posix incompatibility). It is
** used in the journal to signify that the remainder of the journal file 
** is devoted to storing a master journal name - there are no more pages to
** roll back. See comments for function writeMasterJournal() for details.
**
** The argument x is a pointer to an object with a pageSize member.  The
** result is PENDING_BYTE divided by that page size, plus one.  The
** commented-out definition below is an earlier form without the "+1".
*/
/* #define PAGER_MJ_PGNO(x) (PENDING_BYTE/((x)->pageSize)) */
#define PAGER_MJ_PGNO(x) ((PENDING_BYTE/((x)->pageSize))+1)

/*
** The maximum legal page number is (2^31 - 1).
*/
#define PAGER_MAX_PGNO 2147483647
   505 
/*
** The pagerEnter() and pagerLeave() routines acquire and release
** a mutex on each pager.  The mutex is recursive.
**
** This is a special-purpose mutex.  It only provides mutual exclusion
** between the Btree and the Memory Management sqlite3_release_memory()
** function.  It does not prevent, for example, two Btrees from accessing
** the same pager at the same time.  Other general-purpose mutexes in
** the btree layer handle that chore.
**
** When SQLITE_ENABLE_MEMORY_MANAGEMENT is not defined both routines are
** macros that expand to nothing.
*/
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  /*
  ** Enter pager p.  Calls nest; p->iInUseDB counts the nesting depth.
  */
  static void pagerEnter(Pager *p){
    p->iInUseDB++;
    /* Only the outermost entry (depth just became 1) has to check whether
    ** p->iInUseMM is set. */
    if( p->iInUseMM && p->iInUseDB==1 ){
#ifndef SQLITE_MUTEX_NOOP
      /* NOTE(review): with SQLITE_MUTEX_NOOP defined, "mutex" is never
      ** declared, so sqlite3_mutex_enter()/leave() must discard their
      ** argument in that configuration - confirm. */
      sqlite3_mutex *mutex;
      mutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MEM2);
#endif
      /* Withdraw the claim, acquire the STATIC_MEM2 mutex, then re-assert
      ** the claim before releasing the mutex again.
      ** NOTE(review): presumably the memory manager holds this mutex while
      ** iInUseMM is set, making this a wait-until-finished handshake -
      ** confirm against the sqlite3_release_memory() implementation. */
      p->iInUseDB = 0;
      sqlite3_mutex_enter(mutex);
      p->iInUseDB = 1;
      sqlite3_mutex_leave(mutex);
    }
    assert( p->iInUseMM==0 );
  }
  /*
  ** Leave pager p: undo one pagerEnter() by decrementing the nesting
  ** depth.  The depth must never go negative.
  */
  static void pagerLeave(Pager *p){
    p->iInUseDB--;
    assert( p->iInUseDB>=0 );
  }
#else
# define pagerEnter(X)
# define pagerLeave(X)
#endif
   539 
   540 /*
   541 ** Add page pPg to the end of the linked list managed by structure
   542 ** pList (pPg becomes the last entry in the list - the most recently 
   543 ** used). Argument pLink should point to either pPg->free or pPg->gfree,
   544 ** depending on whether pPg is being added to the pager-specific or
   545 ** global LRU list.
   546 */
   547 static void listAdd(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
   548   pLink->pNext = 0;
   549   pLink->pPrev = pList->pLast;
   550 
   551 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
   552   assert(pLink==&pPg->free || pLink==&pPg->gfree);
   553   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
   554 #endif
   555 
   556   if( pList->pLast ){
   557     int iOff = (char *)pLink - (char *)pPg;
   558     PagerLruLink *pLastLink = (PagerLruLink *)(&((u8 *)pList->pLast)[iOff]);
   559     pLastLink->pNext = pPg;
   560   }else{
   561     assert(!pList->pFirst);
   562     pList->pFirst = pPg;
   563   }
   564 
   565   pList->pLast = pPg;
   566   if( !pList->pFirstSynced && pPg->needSync==0 ){
   567     pList->pFirstSynced = pPg;
   568   }
   569 }
   570 
   571 /*
   572 ** Remove pPg from the list managed by the structure pointed to by pList.
   573 **
   574 ** Argument pLink should point to either pPg->free or pPg->gfree, depending 
   575 ** on whether pPg is being added to the pager-specific or global LRU list.
   576 */
   577 static void listRemove(PagerLruList *pList, PagerLruLink *pLink, PgHdr *pPg){
   578   int iOff = (char *)pLink - (char *)pPg;
   579 
   580 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
   581   assert(pLink==&pPg->free || pLink==&pPg->gfree);
   582   assert(pLink==&pPg->gfree || pList!=&sqlite3LruPageList);
   583 #endif
   584 
   585   if( pPg==pList->pFirst ){
   586     pList->pFirst = pLink->pNext;
   587   }
   588   if( pPg==pList->pLast ){
   589     pList->pLast = pLink->pPrev;
   590   }
   591   if( pLink->pPrev ){
   592     PagerLruLink *pPrevLink = (PagerLruLink *)(&((u8 *)pLink->pPrev)[iOff]);
   593     pPrevLink->pNext = pLink->pNext;
   594   }
   595   if( pLink->pNext ){
   596     PagerLruLink *pNextLink = (PagerLruLink *)(&((u8 *)pLink->pNext)[iOff]);
   597     pNextLink->pPrev = pLink->pPrev;
   598   }
   599   if( pPg==pList->pFirstSynced ){
   600     PgHdr *p = pLink->pNext;
   601     while( p && p->needSync ){
   602       PagerLruLink *pL = (PagerLruLink *)(&((u8 *)p)[iOff]);
   603       p = pL->pNext;
   604     }
   605     pList->pFirstSynced = p;
   606   }
   607 
   608   pLink->pNext = pLink->pPrev = 0;
   609 }
   610 
   611 /* 
   612 ** Add page pPg to the list of free pages for the pager. If 
   613 ** memory-management is enabled, also add the page to the global 
   614 ** list of free pages.
   615 */
   616 static void lruListAdd(PgHdr *pPg){
   617   listAdd(&pPg->pPager->lru, &pPg->free, pPg);
   618 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
   619   if( !pPg->pPager->memDb ){
   620     sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   621     listAdd(&sqlite3LruPageList, &pPg->gfree, pPg);
   622     sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   623   }
   624 #endif
   625 }
   626 
   627 /* 
   628 ** Remove page pPg from the list of free pages for the associated pager.
   629 ** If memory-management is enabled, also remove pPg from the global list
   630 ** of free pages.
   631 */
   632 static void lruListRemove(PgHdr *pPg){
   633   listRemove(&pPg->pPager->lru, &pPg->free, pPg);
   634 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
   635   if( !pPg->pPager->memDb ){
   636     sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   637     listRemove(&sqlite3LruPageList, &pPg->gfree, pPg);
   638     sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   639   }
   640 #endif
   641 }
   642 
   643 /* 
   644 ** This function is called just after the needSync flag has been cleared
   645 ** from all pages managed by pPager (usually because the journal file
   646 ** has just been synced). It updates the pPager->lru.pFirstSynced variable
   647 ** and, if memory-management is enabled, the sqlite3LruPageList.pFirstSynced
   648 ** variable also.
   649 */
   650 static void lruListSetFirstSynced(Pager *pPager){
   651   pPager->lru.pFirstSynced = pPager->lru.pFirst;
   652 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
   653   if( !pPager->memDb ){
   654     PgHdr *p;
   655     sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   656     for(p=sqlite3LruPageList.pFirst; p && p->needSync; p=p->gfree.pNext);
   657     assert(p==pPager->lru.pFirstSynced || p==sqlite3LruPageList.pFirstSynced);
   658     sqlite3LruPageList.pFirstSynced = p;
   659     sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
   660   }
   661 #endif
   662 }
   663 
   664 /*
   665 ** Return true if page *pPg has already been written to the statement
   666 ** journal (or statement snapshot has been created, if *pPg is part
   667 ** of an in-memory database).
   668 */
   669 static int pageInStatement(PgHdr *pPg){
   670   Pager *pPager = pPg->pPager;
   671   if( MEMDB ){
   672     return PGHDR_TO_HIST(pPg, pPager)->inStmt;
   673   }else{
   674     return sqlite3BitvecTest(pPager->pInStmt, pPg->pgno);
   675   }
   676 }
   677 
   678 /*
   679 ** Change the size of the pager hash table to N.  N must be a power
   680 ** of two.
   681 */
   682 static void pager_resize_hash_table(Pager *pPager, int N){
   683   PgHdr **aHash, *pPg;
   684   assert( N>0 && (N&(N-1))==0 );
   685 #ifdef SQLITE_MALLOC_SOFT_LIMIT
   686   if( N*sizeof(aHash[0])>SQLITE_MALLOC_SOFT_LIMIT ){
   687     N = SQLITE_MALLOC_SOFT_LIMIT/sizeof(aHash[0]);
   688   }
   689   if( N==pPager->nHash ) return;
   690 #endif
   691   pagerLeave(pPager);
   692   if( pPager->aHash!=0 ) sqlite3BeginBenignMalloc();
   693   aHash = sqlite3MallocZero( sizeof(aHash[0])*N );
   694   if( pPager->aHash!=0 ) sqlite3EndBenignMalloc();
   695   pagerEnter(pPager);
   696   if( aHash==0 ){
   697     /* Failure to rehash is not an error.  It is only a performance hit. */
   698     return;
   699   }
   700   sqlite3_free(pPager->aHash);
   701   pPager->nHash = N;
   702   pPager->aHash = aHash;
   703   for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
   704     int h;
   705     if( pPg->pgno==0 ){
   706       assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
   707       continue;
   708     }
   709     h = pPg->pgno & (N-1);
   710     pPg->pNextHash = aHash[h];
   711     if( aHash[h] ){
   712       aHash[h]->pPrevHash = pPg;
   713     }
   714     aHash[h] = pPg;
   715     pPg->pPrevHash = 0;
   716   }
   717 }
   718 
   719 /*
   720 ** Read a 32-bit integer from the given file descriptor.  Store the integer
   721 ** that is read in *pRes.  Return SQLITE_OK if everything worked, or an
   722 ** error code is something goes wrong.
   723 **
   724 ** All values are stored on disk as big-endian.
   725 */
   726 static int read32bits(sqlite3_file *fd, i64 offset, u32 *pRes){
   727   unsigned char ac[4];
   728   int rc = sqlite3OsRead(fd, ac, sizeof(ac), offset);
   729   if( rc==SQLITE_OK ){
   730     *pRes = sqlite3Get4byte(ac);
   731   }
   732   return rc;
   733 }
   734 
   735 /*
   736 ** Write a 32-bit integer into a string buffer in big-endian byte order.
   737 */
   738 #define put32bits(A,B)  sqlite3Put4byte((u8*)A,B)
   739 
   740 /*
   741 ** Write a 32-bit integer into the given file descriptor.  Return SQLITE_OK
   742 ** on success or an error code is something goes wrong.
   743 */
   744 static int write32bits(sqlite3_file *fd, i64 offset, u32 val){
   745   char ac[4];
   746   put32bits(ac, val);
   747   return sqlite3OsWrite(fd, ac, 4, offset);
   748 }
   749 
   750 /*
   751 ** If file pFd is open, call sqlite3OsUnlock() on it.
   752 */
   753 static int osUnlock(sqlite3_file *pFd, int eLock){
   754   if( !pFd->pMethods ){
   755     return SQLITE_OK;
   756   }
   757   return sqlite3OsUnlock(pFd, eLock);
   758 }
   759 
   760 /*
   761 ** This function determines whether or not the atomic-write optimization
   762 ** can be used with this pager. The optimization can be used if:
   763 **
   764 **  (a) the value returned by OsDeviceCharacteristics() indicates that
   765 **      a database page may be written atomically, and
   766 **  (b) the value returned by OsSectorSize() is less than or equal
   767 **      to the page size.
   768 **
   769 ** If the optimization cannot be used, 0 is returned. If it can be used,
   770 ** then the value returned is the size of the journal file when it
   771 ** contains rollback data for exactly one page.
   772 */
   773 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
   774 static int jrnlBufferSize(Pager *pPager){
   775   int dc;           /* Device characteristics */
   776   int nSector;      /* Sector size */
   777   int szPage;        /* Page size */
   778   sqlite3_file *fd = pPager->fd;
   779 
   780   if( fd->pMethods ){
   781     dc = sqlite3OsDeviceCharacteristics(fd);
   782     nSector = sqlite3OsSectorSize(fd);
   783     szPage = pPager->pageSize;
   784   }
   785 
   786   assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
   787   assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
   788 
   789   if( !fd->pMethods || 
   790        (dc & (SQLITE_IOCAP_ATOMIC|(szPage>>8)) && nSector<=szPage) ){
   791     return JOURNAL_HDR_SZ(pPager) + JOURNAL_PG_SZ(pPager);
   792   }
   793   return 0;
   794 }
   795 #endif
   796 
   797 /*
   798 ** This function should be called when an error occurs within the pager
   799 ** code. The first argument is a pointer to the pager structure, the
   800 ** second the error-code about to be returned by a pager API function. 
   801 ** The value returned is a copy of the second argument to this function. 
   802 **
   803 ** If the second argument is SQLITE_IOERR, SQLITE_CORRUPT, or SQLITE_FULL
   804 ** the error becomes persistent. Until the persisten error is cleared,
   805 ** subsequent API calls on this Pager will immediately return the same 
   806 ** error code.
   807 **
   808 ** A persistent error indicates that the contents of the pager-cache 
   809 ** cannot be trusted. This state can be cleared by completely discarding 
   810 ** the contents of the pager-cache. If a transaction was active when
   811 ** the persistent error occured, then the rollback journal may need
   812 ** to be replayed.
   813 */
   814 static void pager_unlock(Pager *pPager);
   815 static int pager_error(Pager *pPager, int rc){
   816   int rc2 = rc & 0xff;
   817   assert(
   818        pPager->errCode==SQLITE_FULL ||
   819        pPager->errCode==SQLITE_OK ||
   820        (pPager->errCode & 0xff)==SQLITE_IOERR
   821   );
   822   if(
   823     rc2==SQLITE_FULL ||
   824     rc2==SQLITE_IOERR ||
   825     rc2==SQLITE_CORRUPT
   826   ){
   827     pPager->errCode = rc;
   828     if( pPager->state==PAGER_UNLOCK && pPager->nRef==0 ){
   829       /* If the pager is already unlocked, call pager_unlock() now to
   830       ** clear the error state and ensure that the pager-cache is 
   831       ** completely empty.
   832       */
   833       pager_unlock(pPager);
   834     }
   835   }
   836   return rc;
   837 }
   838 
   839 /*
   840 ** If SQLITE_CHECK_PAGES is defined then we do some sanity checking
   841 ** on the cache using a hash function.  This is used for testing
   842 ** and debugging only.
   843 */
   844 #ifdef SQLITE_CHECK_PAGES
   845 /*
   846 ** Return a 32-bit hash of the page data for pPage.
   847 */
   848 static u32 pager_datahash(int nByte, unsigned char *pData){
   849   u32 hash = 0;
   850   int i;
   851   for(i=0; i<nByte; i++){
   852     hash = (hash*1039) + pData[i];
   853   }
   854   return hash;
   855 }
   856 static u32 pager_pagehash(PgHdr *pPage){
   857   return pager_datahash(pPage->pPager->pageSize, 
   858                         (unsigned char *)PGHDR_TO_DATA(pPage));
   859 }
   860 
   861 /*
   862 ** The CHECK_PAGE macro takes a PgHdr* as an argument. If SQLITE_CHECK_PAGES
   863 ** is defined, and NDEBUG is not defined, an assert() statement checks
   864 ** that the page is either dirty or still matches the calculated page-hash.
   865 */
   866 #define CHECK_PAGE(x) checkPage(x)
   867 static void checkPage(PgHdr *pPg){
   868   Pager *pPager = pPg->pPager;
   869   assert( !pPg->pageHash || pPager->errCode || MEMDB || pPg->dirty || 
   870       pPg->pageHash==pager_pagehash(pPg) );
   871 }
   872 
   873 #else
/* SQLITE_CHECK_PAGES is not defined: the page-hash helpers collapse to
** the constant 0 and CHECK_PAGE() expands to nothing. */
#define pager_datahash(X,Y)  0
#define pager_pagehash(X)  0
#define CHECK_PAGE(x)
   877 #endif
   878 
   879 /*
   880 ** When this is called the journal file for pager pPager must be open.
   881 ** The master journal file name is read from the end of the file and 
   882 ** written into memory supplied by the caller. 
   883 **
   884 ** zMaster must point to a buffer of at least nMaster bytes allocated by
   885 ** the caller. This should be sqlite3_vfs.mxPathname+1 (to ensure there is
   886 ** enough space to write the master journal name). If the master journal
   887 ** name in the journal is longer than nMaster bytes (including a
   888 ** nul-terminator), then this is handled as if no master journal name
   889 ** were present in the journal.
   890 **
   891 ** If no master journal file name is present zMaster[0] is set to 0 and
   892 ** SQLITE_OK returned.
   893 */
   894 static int readMasterJournal(sqlite3_file *pJrnl, char *zMaster, int nMaster){
   895   int rc;
   896   u32 len;
   897   i64 szJ;
   898   u32 cksum;
   899   u32 u;                   /* Unsigned loop counter */
   900   unsigned char aMagic[8]; /* A buffer to hold the magic header */
   901 
   902   zMaster[0] = '\0';
   903 
   904   rc = sqlite3OsFileSize(pJrnl, &szJ);
   905   if( rc!=SQLITE_OK || szJ<16 ) return rc;
   906 
   907   rc = read32bits(pJrnl, szJ-16, &len);
   908   if( rc!=SQLITE_OK ) return rc;
   909 
   910   if( len>=nMaster ){
   911     return SQLITE_OK;
   912   }
   913 
   914   rc = read32bits(pJrnl, szJ-12, &cksum);
   915   if( rc!=SQLITE_OK ) return rc;
   916 
   917   rc = sqlite3OsRead(pJrnl, aMagic, 8, szJ-8);
   918   if( rc!=SQLITE_OK || memcmp(aMagic, aJournalMagic, 8) ) return rc;
   919 
   920   rc = sqlite3OsRead(pJrnl, zMaster, len, szJ-16-len);
   921   if( rc!=SQLITE_OK ){
   922     return rc;
   923   }
   924   zMaster[len] = '\0';
   925 
   926   /* See if the checksum matches the master journal name */
   927   for(u=0; u<len; u++){
   928     cksum -= zMaster[u];
   929    }
   930   if( cksum ){
   931     /* If the checksum doesn't add up, then one or more of the disk sectors
   932     ** containing the master journal filename is corrupted. This means
   933     ** definitely roll back, so just return SQLITE_OK and report a (nul)
   934     ** master-journal filename.
   935     */
   936     zMaster[0] = '\0';
   937   }
   938    
   939   return SQLITE_OK;
   940 }
   941 
   942 /*
   943 ** Seek the journal file descriptor to the next sector boundary where a
   944 ** journal header may be read or written. Pager.journalOff is updated with
   945 ** the new seek offset.
   946 **
   947 ** i.e for a sector size of 512:
   948 **
   949 ** Input Offset              Output Offset
   950 ** ---------------------------------------
   951 ** 0                         0
   952 ** 512                       512
   953 ** 100                       512
   954 ** 2000                      2048
   955 ** 
   956 */
   957 static void seekJournalHdr(Pager *pPager){
   958   i64 offset = 0;
   959   i64 c = pPager->journalOff;
   960   if( c ){
   961     offset = ((c-1)/JOURNAL_HDR_SZ(pPager) + 1) * JOURNAL_HDR_SZ(pPager);
   962   }
   963   assert( offset%JOURNAL_HDR_SZ(pPager)==0 );
   964   assert( offset>=c );
   965   assert( (offset-c)<JOURNAL_HDR_SZ(pPager) );
   966   pPager->journalOff = offset;
   967 }
   968 
   969 /*
   970 ** Write zeros over the header of the journal file.  This has the
   971 ** effect of invalidating the journal file and committing the
   972 ** transaction.
   973 */
   974 static int zeroJournalHdr(Pager *pPager, int doTruncate){
   975   int rc = SQLITE_OK;
   976   static const char zeroHdr[28];
   977 
   978   if( pPager->journalOff ){
   979     i64 iLimit = pPager->journalSizeLimit;
   980 
   981     IOTRACE(("JZEROHDR %p\n", pPager))
   982     if( doTruncate || iLimit==0 ){
   983       rc = sqlite3OsTruncate(pPager->jfd, 0);
   984     }else{
   985       rc = sqlite3OsWrite(pPager->jfd, zeroHdr, sizeof(zeroHdr), 0);
   986     }
   987     if( rc==SQLITE_OK && !pPager->noSync ){
   988       rc = sqlite3OsSync(pPager->jfd, SQLITE_SYNC_DATAONLY|pPager->sync_flags);
   989     }
   990 
   991     /* At this point the transaction is committed but the write lock 
   992     ** is still held on the file. If there is a size limit configured for 
   993     ** the persistent journal and the journal file currently consumes more
   994     ** space than that limit allows for, truncate it now. There is no need
   995     ** to sync the file following this operation.
   996     */
   997     if( rc==SQLITE_OK && iLimit>0 ){
   998       i64 sz;
   999       rc = sqlite3OsFileSize(pPager->jfd, &sz);
  1000       if( rc==SQLITE_OK && sz>iLimit ){
  1001         rc = sqlite3OsTruncate(pPager->jfd, iLimit);
  1002       }
  1003     }
  1004   }
  1005   return rc;
  1006 }
  1007 
  1008 /*
  1009 ** The journal file must be open when this routine is called. A journal
  1010 ** header (JOURNAL_HDR_SZ bytes) is written into the journal file at the
  1011 ** current location.
  1012 **
  1013 ** The format for the journal header is as follows:
  1014 ** - 8 bytes: Magic identifying journal format.
  1015 ** - 4 bytes: Number of records in journal, or -1 no-sync mode is on.
  1016 ** - 4 bytes: Random number used for page hash.
  1017 ** - 4 bytes: Initial database page count.
  1018 ** - 4 bytes: Sector size used by the process that wrote this journal.
  1019 ** - 4 bytes: Database page size.
  1020 ** 
  1021 ** Followed by (JOURNAL_HDR_SZ - 28) bytes of unused space.
  1022 */
  1023 static int writeJournalHdr(Pager *pPager){
  1024   int rc = SQLITE_OK;
  1025   char *zHeader = pPager->pTmpSpace;
  1026   int nHeader = pPager->pageSize;
  1027   int nWrite;
  1028 
  1029   if( nHeader>JOURNAL_HDR_SZ(pPager) ){
  1030     nHeader = JOURNAL_HDR_SZ(pPager);
  1031   }
  1032 
  1033   if( pPager->stmtHdrOff==0 ){
  1034     pPager->stmtHdrOff = pPager->journalOff;
  1035   }
  1036 
  1037   seekJournalHdr(pPager);
  1038   pPager->journalHdr = pPager->journalOff;
  1039 
  1040   memcpy(zHeader, aJournalMagic, sizeof(aJournalMagic));
  1041 
  1042   /* 
  1043   ** Write the nRec Field - the number of page records that follow this
  1044   ** journal header. Normally, zero is written to this value at this time.
  1045   ** After the records are added to the journal (and the journal synced, 
  1046   ** if in full-sync mode), the zero is overwritten with the true number
  1047   ** of records (see syncJournal()).
  1048   **
  1049   ** A faster alternative is to write 0xFFFFFFFF to the nRec field. When
  1050   ** reading the journal this value tells SQLite to assume that the
  1051   ** rest of the journal file contains valid page records. This assumption
  1052   ** is dangerous, as if a failure occured whilst writing to the journal
  1053   ** file it may contain some garbage data. There are two scenarios
  1054   ** where this risk can be ignored:
  1055   **
  1056   **   * When the pager is in no-sync mode. Corruption can follow a
  1057   **     power failure in this case anyway.
  1058   **
  1059   **   * When the SQLITE_IOCAP_SAFE_APPEND flag is set. This guarantees
  1060   **     that garbage data is never appended to the journal file.
  1061   */
  1062   assert(pPager->fd->pMethods||pPager->noSync);
  1063   if( (pPager->noSync) 
  1064    || (sqlite3OsDeviceCharacteristics(pPager->fd)&SQLITE_IOCAP_SAFE_APPEND) 
  1065   ){
  1066     put32bits(&zHeader[sizeof(aJournalMagic)], 0xffffffff);
  1067   }else{
  1068     put32bits(&zHeader[sizeof(aJournalMagic)], 0);
  1069   }
  1070 
  1071   /* The random check-hash initialiser */ 
  1072   sqlite3_randomness(sizeof(pPager->cksumInit), &pPager->cksumInit);
  1073   put32bits(&zHeader[sizeof(aJournalMagic)+4], pPager->cksumInit);
  1074   /* The initial database size */
  1075   put32bits(&zHeader[sizeof(aJournalMagic)+8], pPager->dbSize);
  1076   /* The assumed sector size for this process */
  1077   put32bits(&zHeader[sizeof(aJournalMagic)+12], pPager->sectorSize);
  1078   if( pPager->journalHdr==0 ){
  1079     /* The page size */
  1080     put32bits(&zHeader[sizeof(aJournalMagic)+16], pPager->pageSize);
  1081   }
  1082 
  1083   for(nWrite=0; rc==SQLITE_OK&&nWrite<JOURNAL_HDR_SZ(pPager); nWrite+=nHeader){
  1084     IOTRACE(("JHDR %p %lld %d\n", pPager, pPager->journalHdr, nHeader))
  1085     rc = sqlite3OsWrite(pPager->jfd, zHeader, nHeader, pPager->journalOff);
  1086     pPager->journalOff += nHeader;
  1087   }
  1088 
  1089   return rc;
  1090 }
  1091 
  1092 /*
  1093 ** The journal file must be open when this is called. A journal header file
  1094 ** (JOURNAL_HDR_SZ bytes) is read from the current location in the journal
  1095 ** file. See comments above function writeJournalHdr() for a description of
  1096 ** the journal header format.
  1097 **
  1098 ** If the header is read successfully, *nRec is set to the number of
  1099 ** page records following this header and *dbSize is set to the size of the
  1100 ** database before the transaction began, in pages. Also, pPager->cksumInit
  1101 ** is set to the value read from the journal header. SQLITE_OK is returned
  1102 ** in this case.
  1103 **
  1104 ** If the journal header file appears to be corrupted, SQLITE_DONE is
  1105 ** returned and *nRec and *dbSize are not set.  If JOURNAL_HDR_SZ bytes
  1106 ** cannot be read from the journal file an error code is returned.
  1107 */
  1108 static int readJournalHdr(
  1109   Pager *pPager, 
  1110   i64 journalSize,
  1111   u32 *pNRec, 
  1112   u32 *pDbSize
  1113 ){
  1114   int rc;
  1115   unsigned char aMagic[8]; /* A buffer to hold the magic header */
  1116   i64 jrnlOff;
  1117   int iPageSize;
  1118 
  1119   seekJournalHdr(pPager);
  1120   if( pPager->journalOff+JOURNAL_HDR_SZ(pPager) > journalSize ){
  1121     return SQLITE_DONE;
  1122   }
  1123   jrnlOff = pPager->journalOff;
  1124 
  1125   rc = sqlite3OsRead(pPager->jfd, aMagic, sizeof(aMagic), jrnlOff);
  1126   if( rc ) return rc;
  1127   jrnlOff += sizeof(aMagic);
  1128 
  1129   if( memcmp(aMagic, aJournalMagic, sizeof(aMagic))!=0 ){
  1130     return SQLITE_DONE;
  1131   }
  1132 
  1133   rc = read32bits(pPager->jfd, jrnlOff, pNRec);
  1134   if( rc ) return rc;
  1135 
  1136   rc = read32bits(pPager->jfd, jrnlOff+4, &pPager->cksumInit);
  1137   if( rc ) return rc;
  1138 
  1139   rc = read32bits(pPager->jfd, jrnlOff+8, pDbSize);
  1140   if( rc ) return rc;
  1141 
  1142   rc = read32bits(pPager->jfd, jrnlOff+16, (u32 *)&iPageSize);
  1143   if( rc==SQLITE_OK 
  1144    && iPageSize>=512 
  1145    && iPageSize<=SQLITE_MAX_PAGE_SIZE 
  1146    && ((iPageSize-1)&iPageSize)==0 
  1147   ){
  1148     u16 pagesize = iPageSize;
  1149     rc = sqlite3PagerSetPagesize(pPager, &pagesize);
  1150   }
  1151   if( rc ) return rc;
  1152 
  1153   /* Update the assumed sector-size to match the value used by 
  1154   ** the process that created this journal. If this journal was
  1155   ** created by a process other than this one, then this routine
  1156   ** is being called from within pager_playback(). The local value
  1157   ** of Pager.sectorSize is restored at the end of that routine.
  1158   */
  1159   rc = read32bits(pPager->jfd, jrnlOff+12, (u32 *)&pPager->sectorSize);
  1160   if( rc ) return rc;
  1161 
  1162   pPager->journalOff += JOURNAL_HDR_SZ(pPager);
  1163   return SQLITE_OK;
  1164 }
  1165 
  1166 
  1167 /*
  1168 ** Write the supplied master journal name into the journal file for pager
  1169 ** pPager at the current location. The master journal name must be the last
  1170 ** thing written to a journal file. If the pager is in full-sync mode, the
  1171 ** journal file descriptor is advanced to the next sector boundary before
  1172 ** anything is written. The format is:
  1173 **
  1174 ** + 4 bytes: PAGER_MJ_PGNO.
  1175 ** + N bytes: length of master journal name.
  1176 ** + 4 bytes: N
  1177 ** + 4 bytes: Master journal name checksum.
  1178 ** + 8 bytes: aJournalMagic[].
  1179 **
  1180 ** The master journal page checksum is the sum of the bytes in the master
  1181 ** journal name.
  1182 **
  1183 ** If zMaster is a NULL pointer (occurs for a single database transaction), 
  1184 ** this call is a no-op.
  1185 */
  1186 static int writeMasterJournal(Pager *pPager, const char *zMaster){
  1187   int rc;
  1188   int len; 
  1189   int i; 
  1190   i64 jrnlOff;
  1191   i64 jrnlSize;
  1192   u32 cksum = 0;
  1193   char zBuf[sizeof(aJournalMagic)+2*4];
  1194 
  1195   if( !zMaster || pPager->setMaster) return SQLITE_OK;
  1196   pPager->setMaster = 1;
  1197 
  1198   len = strlen(zMaster);
  1199   for(i=0; i<len; i++){
  1200     cksum += zMaster[i];
  1201   }
  1202 
  1203   /* If in full-sync mode, advance to the next disk sector before writing
  1204   ** the master journal name. This is in case the previous page written to
  1205   ** the journal has already been synced.
  1206   */
  1207   if( pPager->fullSync ){
  1208     seekJournalHdr(pPager);
  1209   }
  1210   jrnlOff = pPager->journalOff;
  1211   pPager->journalOff += (len+20);
  1212 
  1213   rc = write32bits(pPager->jfd, jrnlOff, PAGER_MJ_PGNO(pPager));
  1214   if( rc!=SQLITE_OK ) return rc;
  1215   jrnlOff += 4;
  1216 
  1217   rc = sqlite3OsWrite(pPager->jfd, zMaster, len, jrnlOff);
  1218   if( rc!=SQLITE_OK ) return rc;
  1219   jrnlOff += len;
  1220 
  1221   put32bits(zBuf, len);
  1222   put32bits(&zBuf[4], cksum);
  1223   memcpy(&zBuf[8], aJournalMagic, sizeof(aJournalMagic));
  1224   rc = sqlite3OsWrite(pPager->jfd, zBuf, 8+sizeof(aJournalMagic), jrnlOff);
  1225   jrnlOff += 8+sizeof(aJournalMagic);
  1226   pPager->needSync = !pPager->noSync;
  1227 
  1228   /* If the pager is in peristent-journal mode, then the physical 
  1229   ** journal-file may extend past the end of the master-journal name
  1230   ** and 8 bytes of magic data just written to the file. This is 
  1231   ** dangerous because the code to rollback a hot-journal file
  1232   ** will not be able to find the master-journal name to determine 
  1233   ** whether or not the journal is hot. 
  1234   **
  1235   ** Easiest thing to do in this scenario is to truncate the journal 
  1236   ** file to the required size.
  1237   */ 
  1238   if( (rc==SQLITE_OK)
  1239    && (rc = sqlite3OsFileSize(pPager->jfd, &jrnlSize))==SQLITE_OK
  1240    && jrnlSize>jrnlOff
  1241   ){
  1242     rc = sqlite3OsTruncate(pPager->jfd, jrnlOff);
  1243   }
  1244   return rc;
  1245 }
  1246 
  1247 /*
  1248 ** Add or remove a page from the list of all pages that are in the
  1249 ** statement journal.
  1250 **
  1251 ** The Pager keeps a separate list of pages that are currently in
  1252 ** the statement journal.  This helps the sqlite3PagerStmtCommit()
  1253 ** routine run MUCH faster for the common case where there are many
  1254 ** pages in memory but only a few are in the statement journal.
  1255 */
  1256 static void page_add_to_stmt_list(PgHdr *pPg){
  1257   Pager *pPager = pPg->pPager;
  1258   PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  1259   assert( MEMDB );
  1260   if( !pHist->inStmt ){
  1261     assert( pHist->pPrevStmt==0 && pHist->pNextStmt==0 );
  1262     if( pPager->pStmt ){
  1263       PGHDR_TO_HIST(pPager->pStmt, pPager)->pPrevStmt = pPg;
  1264     }
  1265     pHist->pNextStmt = pPager->pStmt;
  1266     pPager->pStmt = pPg;
  1267     pHist->inStmt = 1;
  1268   }
  1269 }
  1270 
  1271 /*
  1272 ** Find a page in the hash table given its page number.  Return
  1273 ** a pointer to the page or NULL if not found.
  1274 */
  1275 static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){
  1276   PgHdr *p;
  1277   if( pPager->aHash==0 ) return 0;
  1278   p = pPager->aHash[pgno & (pPager->nHash-1)];
  1279   while( p && p->pgno!=pgno ){
  1280     p = p->pNextHash;
  1281   }
  1282   return p;
  1283 }
  1284 
  1285 /*
  1286 ** Clear the in-memory cache.  This routine
  1287 ** sets the state of the pager back to what it was when it was first
  1288 ** opened.  Any outstanding pages are invalidated and subsequent attempts
  1289 ** to access those pages will likely result in a coredump.
  1290 */
  1291 static void pager_reset(Pager *pPager){
  1292   PgHdr *pPg, *pNext;
  1293   if( pPager->errCode ) return;
  1294   for(pPg=pPager->pAll; pPg; pPg=pNext){
  1295     IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
  1296     PAGER_INCR(sqlite3_pager_pgfree_count);
  1297     pNext = pPg->pNextAll;
  1298     lruListRemove(pPg);
  1299     sqlite3PageFree(pPg->pData);
  1300     sqlite3_free(pPg);
  1301   }
  1302   assert(pPager->lru.pFirst==0);
  1303   assert(pPager->lru.pFirstSynced==0);
  1304   assert(pPager->lru.pLast==0);
  1305   pPager->pStmt = 0;
  1306   pPager->pAll = 0;
  1307   pPager->pDirty = 0;
  1308   pPager->nHash = 0;
  1309   sqlite3_free(pPager->aHash);
  1310   pPager->nPage = 0;
  1311   pPager->aHash = 0;
  1312   pPager->nRef = 0;
  1313 }
  1314 
  1315 /*
  1316 ** Unlock the database file. 
  1317 **
  1318 ** If the pager is currently in error state, discard the contents of 
  1319 ** the cache and reset the Pager structure internal state. If there is
  1320 ** an open journal-file, then the next time a shared-lock is obtained
  1321 ** on the pager file (by this or any other process), it will be
  1322 ** treated as a hot-journal and rolled back.
  1323 */
  1324 static void pager_unlock(Pager *pPager){
  1325   if( !pPager->exclusiveMode ){
  1326     if( !MEMDB ){
  1327       int rc = osUnlock(pPager->fd, NO_LOCK);
  1328       if( rc ) pPager->errCode = rc;
  1329       pPager->dbSize = -1;
  1330       IOTRACE(("UNLOCK %p\n", pPager))
  1331 
  1332       /* Always close the journal file when dropping the database lock.
  1333       ** Otherwise, another connection with journal_mode=delete might
  1334       ** delete the file out from under us.
  1335       */
  1336       if( pPager->journalOpen ){
  1337         sqlite3OsClose(pPager->jfd);
  1338         pPager->journalOpen = 0;
  1339         sqlite3BitvecDestroy(pPager->pInJournal);
  1340         pPager->pInJournal = 0;
  1341       }
  1342 
  1343       /* If Pager.errCode is set, the contents of the pager cache cannot be
  1344       ** trusted. Now that the pager file is unlocked, the contents of the
  1345       ** cache can be discarded and the error code safely cleared.
  1346       */
  1347       if( pPager->errCode ){
  1348         if( rc==SQLITE_OK ) pPager->errCode = SQLITE_OK;
  1349         pager_reset(pPager);
  1350         if( pPager->stmtOpen ){
  1351           sqlite3OsClose(pPager->stfd);
  1352           sqlite3BitvecDestroy(pPager->pInStmt);
  1353           pPager->pInStmt = 0;
  1354         }
  1355         pPager->stmtOpen = 0;
  1356         pPager->stmtInUse = 0;
  1357         pPager->journalOff = 0;
  1358         pPager->journalStarted = 0;
  1359         pPager->stmtAutoopen = 0;
  1360         pPager->origDbSize = 0;
  1361       }
  1362     }
  1363 
  1364     if( !MEMDB || pPager->errCode==SQLITE_OK ){
  1365       pPager->state = PAGER_UNLOCK;
  1366       pPager->changeCountDone = 0;
  1367     }
  1368   }
  1369 }
  1370 
  1371 /*
  1372 ** Execute a rollback if a transaction is active and unlock the 
  1373 ** database file. If the pager has already entered the error state, 
  1374 ** do not attempt the rollback.
  1375 */
  1376 static void pagerUnlockAndRollback(Pager *p){
  1377   /* assert( p->state>=PAGER_RESERVED || p->journalOpen==0 ); */
  1378   if( p->errCode==SQLITE_OK && p->state>=PAGER_RESERVED ){
  1379     sqlite3BeginBenignMalloc();
  1380     sqlite3PagerRollback(p);
  1381     sqlite3EndBenignMalloc();
  1382   }
  1383   pager_unlock(p);
  1384 #if 0
  1385   assert( p->errCode || !p->journalOpen || (p->exclusiveMode&&!p->journalOff) );
  1386   assert( p->errCode || !p->stmtOpen || p->exclusiveMode );
  1387 #endif
  1388 }
  1389 
  1390 /*
  1391 ** This routine ends a transaction.  A transaction is ended by either
  1392 ** a COMMIT or a ROLLBACK.
  1393 **
  1394 ** When this routine is called, the pager has the journal file open and
  1395 ** a RESERVED or EXCLUSIVE lock on the database.  This routine will release
  1396 ** the database lock and acquires a SHARED lock in its place if that is
  1397 ** the appropriate thing to do.  Release locks usually is appropriate,
  1398 ** unless we are in exclusive access mode or unless this is a 
  1399 ** COMMIT AND BEGIN or ROLLBACK AND BEGIN operation.
  1400 **
  1401 ** The journal file is either deleted or truncated.
  1402 **
  1403 ** TODO: Consider keeping the journal file open for temporary databases.
  1404 ** This might give a performance improvement on windows where opening
  1405 ** a file is an expensive operation.
  1406 */
  1407 static int pager_end_transaction(Pager *pPager, int hasMaster){
  1408   PgHdr *pPg;
  1409   int rc = SQLITE_OK;
  1410   int rc2 = SQLITE_OK;
  1411   assert( !MEMDB );
  1412   if( pPager->state<PAGER_RESERVED ){
  1413     return SQLITE_OK;
  1414   }
  1415   sqlite3PagerStmtCommit(pPager);
  1416   if( pPager->stmtOpen && !pPager->exclusiveMode ){
  1417     sqlite3OsClose(pPager->stfd);
  1418     pPager->stmtOpen = 0;
  1419   }
  1420   if( pPager->journalOpen ){
  1421     if( pPager->exclusiveMode 
  1422      || pPager->journalMode==PAGER_JOURNALMODE_PERSIST
  1423     ){
  1424       rc = zeroJournalHdr(pPager, hasMaster);
  1425       pager_error(pPager, rc);
  1426       pPager->journalOff = 0;
  1427       pPager->journalStarted = 0;
  1428     }else{
  1429       sqlite3OsClose(pPager->jfd);
  1430       pPager->journalOpen = 0;
  1431       if( rc==SQLITE_OK && !pPager->tempFile ){
  1432         rc = sqlite3OsDelete(pPager->pVfs, pPager->zJournal, 0);
  1433       }
  1434     }
  1435     sqlite3BitvecDestroy(pPager->pInJournal);
  1436     pPager->pInJournal = 0;
  1437     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  1438       pPg->inJournal = 0;
  1439       pPg->dirty = 0;
  1440       pPg->needSync = 0;
  1441       pPg->alwaysRollback = 0;
  1442 #ifdef SQLITE_CHECK_PAGES
  1443       pPg->pageHash = pager_pagehash(pPg);
  1444 #endif
  1445     }
  1446     pPager->pDirty = 0;
  1447     pPager->dirtyCache = 0;
  1448     pPager->nRec = 0;
  1449   }else{
  1450     assert( pPager->pInJournal==0 );
  1451   }
  1452 
  1453   if( !pPager->exclusiveMode ){
  1454     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
  1455     pPager->state = PAGER_SHARED;
  1456   }else if( pPager->state==PAGER_SYNCED ){
  1457     pPager->state = PAGER_EXCLUSIVE;
  1458   }
  1459   pPager->origDbSize = 0;
  1460   pPager->setMaster = 0;
  1461   pPager->needSync = 0;
  1462   lruListSetFirstSynced(pPager);
  1463   pPager->dbSize = -1;
  1464   pPager->dbModified = 0;
  1465 
  1466   return (rc==SQLITE_OK?rc2:rc);
  1467 }
  1468 
  1469 /*
  1470 ** Compute and return a checksum for the page of data.
  1471 **
  1472 ** This is not a real checksum.  It is really just the sum of the 
  1473 ** random initial value and the page number.  We experimented with
  1474 ** a checksum of the entire data, but that was found to be too slow.
  1475 **
  1476 ** Note that the page number is stored at the beginning of data and
  1477 ** the checksum is stored at the end.  This is important.  If journal
  1478 ** corruption occurs due to a power failure, the most likely scenario
  1479 ** is that one end or the other of the record will be changed.  It is
  1480 ** much less likely that the two ends of the journal record will be
  1481 ** correct and the middle be corrupt.  Thus, this "checksum" scheme,
  1482 ** though fast and simple, catches the mostly likely kind of corruption.
  1483 **
  1484 ** FIX ME:  Consider adding every 200th (or so) byte of the data to the
  1485 ** checksum.  That way if a single page spans 3 or more disk sectors and
  1486 ** only the middle sector is corrupt, we will still have a reasonable
  1487 ** chance of failing the checksum and thus detecting the problem.
  1488 */
  1489 static u32 pager_cksum(Pager *pPager, const u8 *aData){
  1490   u32 cksum = pPager->cksumInit;
  1491   int i = pPager->pageSize-200;
  1492   while( i>0 ){
  1493     cksum += aData[i];
  1494     i -= 200;
  1495   }
  1496   return cksum;
  1497 }
  1498 
  1499 /* Forward declaration */
  1500 static void makeClean(PgHdr*);
  1501 
  1502 /*
  1503 ** Read a single page from the journal file opened on file descriptor
  1504 ** jfd.  Playback this one page.
  1505 **
  1506 ** If useCksum==0 it means this journal does not use checksums.  Checksums
  1507 ** are not used in statement journals because statement journals do not
  1508 ** need to survive power failures.
  1509 */
  1510 static int pager_playback_one_page(
  1511   Pager *pPager, 
  1512   sqlite3_file *jfd,
  1513   i64 offset,
  1514   int useCksum
  1515 ){
  1516   int rc;
  1517   PgHdr *pPg;                   /* An existing page in the cache */
  1518   Pgno pgno;                    /* The page number of a page in journal */
  1519   u32 cksum;                    /* Checksum used for sanity checking */
  1520   u8 *aData = (u8 *)pPager->pTmpSpace;   /* Temp storage for a page */
  1521 
  1522   /* useCksum should be true for the main journal and false for
  1523   ** statement journals.  Verify that this is always the case
  1524   */
  1525   assert( jfd == (useCksum ? pPager->jfd : pPager->stfd) );
  1526   assert( aData );
  1527 
  1528   rc = read32bits(jfd, offset, &pgno);
  1529   if( rc!=SQLITE_OK ) return rc;
  1530   rc = sqlite3OsRead(jfd, aData, pPager->pageSize, offset+4);
  1531   if( rc!=SQLITE_OK ) return rc;
  1532   pPager->journalOff += pPager->pageSize + 4;
  1533 
  1534   /* Sanity checking on the page.  This is more important that I originally
  1535   ** thought.  If a power failure occurs while the journal is being written,
  1536   ** it could cause invalid data to be written into the journal.  We need to
  1537   ** detect this invalid data (with high probability) and ignore it.
  1538   */
  1539   if( pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
  1540     return SQLITE_DONE;
  1541   }
  1542   if( pgno>(unsigned)pPager->dbSize ){
  1543     return SQLITE_OK;
  1544   }
  1545   if( useCksum ){
  1546     rc = read32bits(jfd, offset+pPager->pageSize+4, &cksum);
  1547     if( rc ) return rc;
  1548     pPager->journalOff += 4;
  1549     if( pager_cksum(pPager, aData)!=cksum ){
  1550       return SQLITE_DONE;
  1551     }
  1552   }
  1553 
  1554   assert( pPager->state==PAGER_RESERVED || pPager->state>=PAGER_EXCLUSIVE );
  1555 
  1556   /* If the pager is in RESERVED state, then there must be a copy of this
  1557   ** page in the pager cache. In this case just update the pager cache,
  1558   ** not the database file. The page is left marked dirty in this case.
  1559   **
  1560   ** An exception to the above rule: If the database is in no-sync mode
  1561   ** and a page is moved during an incremental vacuum then the page may
  1562   ** not be in the pager cache. Later: if a malloc() or IO error occurs
  1563   ** during a Movepage() call, then the page may not be in the cache
  1564   ** either. So the condition described in the above paragraph is not
  1565   ** assert()able.
  1566   **
  1567   ** If in EXCLUSIVE state, then we update the pager cache if it exists
  1568   ** and the main file. The page is then marked not dirty.
  1569   **
  1570   ** Ticket #1171:  The statement journal might contain page content that is
  1571   ** different from the page content at the start of the transaction.
  1572   ** This occurs when a page is changed prior to the start of a statement
  1573   ** then changed again within the statement.  When rolling back such a
  1574   ** statement we must not write to the original database unless we know
  1575   ** for certain that original page contents are synced into the main rollback
  1576   ** journal.  Otherwise, a power loss might leave modified data in the
  1577   ** database file without an entry in the rollback journal that can
  1578   ** restore the database to its original form.  Two conditions must be
  1579   ** met before writing to the database files. (1) the database must be
  1580   ** locked.  (2) we know that the original page content is fully synced
  1581   ** in the main journal either because the page is not in cache or else
  1582   ** the page is marked as needSync==0.
  1583   **
  1584   ** 2008-04-14:  When attempting to vacuum a corrupt database file, it
  1585   ** is possible to fail a statement on a database that does not yet exist.
  1586   ** Do not attempt to write if database file has never been opened.
  1587   */
  1588   pPg = pager_lookup(pPager, pgno);
  1589   PAGERTRACE4("PLAYBACK %d page %d hash(%08x)\n",
  1590                PAGERID(pPager), pgno, pager_datahash(pPager->pageSize, aData));
  1591   if( pPager->state>=PAGER_EXCLUSIVE && (pPg==0 || pPg->needSync==0)
  1592         && pPager->fd->pMethods ){
  1593     i64 offset = (pgno-1)*(i64)pPager->pageSize;
  1594     rc = sqlite3OsWrite(pPager->fd, aData, pPager->pageSize, offset);
  1595     if( pPg ){
  1596       makeClean(pPg);
  1597     }
  1598   }
  1599   if( pPg ){
  1600     /* No page should ever be explicitly rolled back that is in use, except
  1601     ** for page 1 which is held in use in order to keep the lock on the
  1602     ** database active. However such a page may be rolled back as a result
  1603     ** of an internal error resulting in an automatic call to
  1604     ** sqlite3PagerRollback().
  1605     */
  1606     void *pData;
  1607     /* assert( pPg->nRef==0 || pPg->pgno==1 ); */
  1608     pData = PGHDR_TO_DATA(pPg);
  1609     memcpy(pData, aData, pPager->pageSize);
  1610     if( pPager->xReiniter ){
  1611       pPager->xReiniter(pPg, pPager->pageSize);
  1612     }
  1613 #ifdef SQLITE_CHECK_PAGES
  1614     pPg->pageHash = pager_pagehash(pPg);
  1615 #endif
  1616     /* If this was page 1, then restore the value of Pager.dbFileVers.
  1617     ** Do this before any decoding. */
  1618     if( pgno==1 ){
  1619       memcpy(&pPager->dbFileVers, &((u8*)pData)[24],sizeof(pPager->dbFileVers));
  1620     }
  1621 
  1622     /* Decode the page just read from disk */
  1623     CODEC1(pPager, pData, pPg->pgno, 3);
  1624   }
  1625   return rc;
  1626 }
  1627 
  1628 /*
  1629 ** Parameter zMaster is the name of a master journal file. A single journal
  1630 ** file that referred to the master journal file has just been rolled back.
  1631 ** This routine checks if it is possible to delete the master journal file,
  1632 ** and does so if it is.
  1633 **
  1634 ** Argument zMaster may point to Pager.pTmpSpace. So that buffer is not 
  1635 ** available for use within this function.
  1636 **
  1637 **
  1638 ** The master journal file contains the names of all child journals.
  1639 ** To tell if a master journal can be deleted, check to each of the
  1640 ** children.  If all children are either missing or do not refer to
  1641 ** a different master journal, then this master journal can be deleted.
  1642 */
  1643 static int pager_delmaster(Pager *pPager, const char *zMaster){
  1644   sqlite3_vfs *pVfs = pPager->pVfs;
  1645   int rc;
  1646   int master_open = 0;
  1647   sqlite3_file *pMaster;
  1648   sqlite3_file *pJournal;
  1649   char *zMasterJournal = 0; /* Contents of master journal file */
  1650   i64 nMasterJournal;       /* Size of master journal file */
  1651 
  1652   /* Open the master journal file exclusively in case some other process
  1653   ** is running this routine also. Not that it makes too much difference.
  1654   */
  1655   pMaster = (sqlite3_file *)sqlite3Malloc(pVfs->szOsFile * 2);
  1656   pJournal = (sqlite3_file *)(((u8 *)pMaster) + pVfs->szOsFile);
  1657   if( !pMaster ){
  1658     rc = SQLITE_NOMEM;
  1659   }else{
  1660     int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MASTER_JOURNAL);
  1661     rc = sqlite3OsOpen(pVfs, zMaster, pMaster, flags, 0);
  1662   }
  1663   if( rc!=SQLITE_OK ) goto delmaster_out;
  1664   master_open = 1;
  1665 
  1666   rc = sqlite3OsFileSize(pMaster, &nMasterJournal);
  1667   if( rc!=SQLITE_OK ) goto delmaster_out;
  1668 
  1669   if( nMasterJournal>0 ){
  1670     char *zJournal;
  1671     char *zMasterPtr = 0;
  1672     int nMasterPtr = pPager->pVfs->mxPathname+1;
  1673 
  1674     /* Load the entire master journal file into space obtained from
  1675     ** sqlite3_malloc() and pointed to by zMasterJournal. 
  1676     */
  1677     zMasterJournal = (char *)sqlite3Malloc(nMasterJournal + nMasterPtr);
  1678     if( !zMasterJournal ){
  1679       rc = SQLITE_NOMEM;
  1680       goto delmaster_out;
  1681     }
  1682     zMasterPtr = &zMasterJournal[nMasterJournal];
  1683     rc = sqlite3OsRead(pMaster, zMasterJournal, nMasterJournal, 0);
  1684     if( rc!=SQLITE_OK ) goto delmaster_out;
  1685 
  1686     zJournal = zMasterJournal;
  1687     while( (zJournal-zMasterJournal)<nMasterJournal ){
  1688       int exists;
  1689       rc = sqlite3OsAccess(pVfs, zJournal, SQLITE_ACCESS_EXISTS, &exists);
  1690       if( rc!=SQLITE_OK ){
  1691         goto delmaster_out;
  1692       }
  1693       if( exists ){
  1694         /* One of the journals pointed to by the master journal exists.
  1695         ** Open it and check if it points at the master journal. If
  1696         ** so, return without deleting the master journal file.
  1697         */
  1698         int c;
  1699         int flags = (SQLITE_OPEN_READONLY|SQLITE_OPEN_MAIN_JOURNAL);
  1700         rc = sqlite3OsOpen(pVfs, zJournal, pJournal, flags, 0);
  1701         if( rc!=SQLITE_OK ){
  1702           goto delmaster_out;
  1703         }
  1704 
  1705         rc = readMasterJournal(pJournal, zMasterPtr, nMasterPtr);
  1706         sqlite3OsClose(pJournal);
  1707         if( rc!=SQLITE_OK ){
  1708           goto delmaster_out;
  1709         }
  1710 
  1711         c = zMasterPtr[0]!=0 && strcmp(zMasterPtr, zMaster)==0;
  1712         if( c ){
  1713           /* We have a match. Do not delete the master journal file. */
  1714           goto delmaster_out;
  1715         }
  1716       }
  1717       zJournal += (strlen(zJournal)+1);
  1718     }
  1719   }
  1720   
  1721   rc = sqlite3OsDelete(pVfs, zMaster, 0);
  1722 
  1723 delmaster_out:
  1724   if( zMasterJournal ){
  1725     sqlite3_free(zMasterJournal);
  1726   }  
  1727   if( master_open ){
  1728     sqlite3OsClose(pMaster);
  1729   }
  1730   sqlite3_free(pMaster);
  1731   return rc;
  1732 }
  1733 
  1734 
  1735 static void pager_truncate_cache(Pager *pPager);
  1736 
  1737 /*
  1738 ** Truncate the main file of the given pager to the number of pages
  1739 ** indicated. Also truncate the cached representation of the file.
  1740 **
  1741 ** Might might be the case that the file on disk is smaller than nPage.
  1742 ** This can happen, for example, if we are in the middle of a transaction
  1743 ** which has extended the file size and the new pages are still all held
  1744 ** in cache, then an INSERT or UPDATE does a statement rollback.  Some
  1745 ** operating system implementations can get confused if you try to
  1746 ** truncate a file to some size that is larger than it currently is,
  1747 ** so detect this case and write a single zero byte to the end of the new
  1748 ** file instead.
  1749 */
  1750 static int pager_truncate(Pager *pPager, int nPage){
  1751   int rc = SQLITE_OK;
  1752   if( pPager->state>=PAGER_EXCLUSIVE && pPager->fd->pMethods ){
  1753     i64 currentSize, newSize;
  1754     rc = sqlite3OsFileSize(pPager->fd, &currentSize);
  1755     newSize = pPager->pageSize*(i64)nPage;
  1756     if( rc==SQLITE_OK && currentSize!=newSize ){
  1757       if( currentSize>newSize ){
  1758         rc = sqlite3OsTruncate(pPager->fd, newSize);
  1759       }else{
  1760         rc = sqlite3OsWrite(pPager->fd, "", 1, newSize-1);
  1761       }
  1762     }
  1763   }
  1764   if( rc==SQLITE_OK ){
  1765     pPager->dbSize = nPage;
  1766     pager_truncate_cache(pPager);
  1767   }
  1768   return rc;
  1769 }
  1770 
  1771 /*
  1772 ** Set the sectorSize for the given pager.
  1773 **
  1774 ** The sector size is at least as big as the sector size reported
  1775 ** by sqlite3OsSectorSize().  The minimum sector size is 512.
  1776 */
  1777 static void setSectorSize(Pager *pPager){
  1778   assert(pPager->fd->pMethods||pPager->tempFile);
  1779   if( !pPager->tempFile ){
  1780     /* Sector size doesn't matter for temporary files. Also, the file
  1781     ** may not have been opened yet, in whcih case the OsSectorSize()
  1782     ** call will segfault.
  1783     */
  1784     pPager->sectorSize = sqlite3OsSectorSize(pPager->fd);
  1785   }
  1786   if( pPager->sectorSize<512 ){
  1787     pPager->sectorSize = 512;
  1788   }
  1789 }
  1790 
  1791 /*
  1792 ** Playback the journal and thus restore the database file to
  1793 ** the state it was in before we started making changes.  
  1794 **
  1795 ** The journal file format is as follows: 
  1796 **
  1797 **  (1)  8 byte prefix.  A copy of aJournalMagic[].
  1798 **  (2)  4 byte big-endian integer which is the number of valid page records
  1799 **       in the journal.  If this value is 0xffffffff, then compute the
  1800 **       number of page records from the journal size.
  1801 **  (3)  4 byte big-endian integer which is the initial value for the 
  1802 **       sanity checksum.
  1803 **  (4)  4 byte integer which is the number of pages to truncate the
  1804 **       database to during a rollback.
  1805 **  (5)  4 byte big-endian integer which is the sector size.  The header
  1806 **       is this many bytes in size.
  1807 **  (6)  4 byte big-endian integer which is the page case.
  1808 **  (7)  4 byte integer which is the number of bytes in the master journal
  1809 **       name.  The value may be zero (indicate that there is no master
  1810 **       journal.)
  1811 **  (8)  N bytes of the master journal name.  The name will be nul-terminated
  1812 **       and might be shorter than the value read from (5).  If the first byte
  1813 **       of the name is \000 then there is no master journal.  The master
  1814 **       journal name is stored in UTF-8.
  1815 **  (9)  Zero or more pages instances, each as follows:
  1816 **        +  4 byte page number.
  1817 **        +  pPager->pageSize bytes of data.
  1818 **        +  4 byte checksum
  1819 **
  1820 ** When we speak of the journal header, we mean the first 8 items above.
  1821 ** Each entry in the journal is an instance of the 9th item.
  1822 **
  1823 ** Call the value from the second bullet "nRec".  nRec is the number of
  1824 ** valid page entries in the journal.  In most cases, you can compute the
  1825 ** value of nRec from the size of the journal file.  But if a power
  1826 ** failure occurred while the journal was being written, it could be the
  1827 ** case that the size of the journal file had already been increased but
  1828 ** the extra entries had not yet made it safely to disk.  In such a case,
  1829 ** the value of nRec computed from the file size would be too large.  For
  1830 ** that reason, we always use the nRec value in the header.
  1831 **
  1832 ** If the nRec value is 0xffffffff it means that nRec should be computed
  1833 ** from the file size.  This value is used when the user selects the
  1834 ** no-sync option for the journal.  A power failure could lead to corruption
  1835 ** in this case.  But for things like temporary table (which will be
  1836 ** deleted when the power is restored) we don't care.  
  1837 **
  1838 ** If the file opened as the journal file is not a well-formed
  1839 ** journal file then all pages up to the first corrupted page are rolled
  1840 ** back (or no pages if the journal header is corrupted). The journal file
  1841 ** is then deleted and SQLITE_OK returned, just as if no corruption had
  1842 ** been encountered.
  1843 **
  1844 ** If an I/O or malloc() error occurs, the journal-file is not deleted
  1845 ** and an error code is returned.
  1846 */
  1847 static int pager_playback(Pager *pPager, int isHot){
  1848   sqlite3_vfs *pVfs = pPager->pVfs;
  1849   i64 szJ;                 /* Size of the journal file in bytes */
  1850   u32 nRec;                /* Number of Records in the journal */
  1851   u32 u;                   /* Unsigned loop counter */
  1852   Pgno mxPg = 0;           /* Size of the original file in pages */
  1853   int rc;                  /* Result code of a subroutine */
  1854   int res = 1;             /* Value returned by sqlite3OsAccess() */
  1855   char *zMaster = 0;       /* Name of master journal file if any */
  1856 
  1857   /* Figure out how many records are in the journal.  Abort early if
  1858   ** the journal is empty.
  1859   */
  1860   assert( pPager->journalOpen );
  1861   rc = sqlite3OsFileSize(pPager->jfd, &szJ);
  1862   if( rc!=SQLITE_OK || szJ==0 ){
  1863     goto end_playback;
  1864   }
  1865 
  1866   /* Read the master journal name from the journal, if it is present.
  1867   ** If a master journal file name is specified, but the file is not
  1868   ** present on disk, then the journal is not hot and does not need to be
  1869   ** played back.
  1870   */
  1871   zMaster = pPager->pTmpSpace;
  1872   rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
  1873   if( rc==SQLITE_OK && zMaster[0] ){
  1874     rc = sqlite3OsAccess(pVfs, zMaster, SQLITE_ACCESS_EXISTS, &res);
  1875   }
  1876   zMaster = 0;
  1877   if( rc!=SQLITE_OK || !res ){
  1878     goto end_playback;
  1879   }
  1880   pPager->journalOff = 0;
  1881 
  1882   /* This loop terminates either when the readJournalHdr() call returns
  1883   ** SQLITE_DONE or an IO error occurs. */
  1884   while( 1 ){
  1885 
  1886     /* Read the next journal header from the journal file.  If there are
  1887     ** not enough bytes left in the journal file for a complete header, or
  1888     ** it is corrupted, then a process must of failed while writing it.
  1889     ** This indicates nothing more needs to be rolled back.
  1890     */
  1891     rc = readJournalHdr(pPager, szJ, &nRec, &mxPg);
  1892     if( rc!=SQLITE_OK ){ 
  1893       if( rc==SQLITE_DONE ){
  1894         rc = SQLITE_OK;
  1895       }
  1896       goto end_playback;
  1897     }
  1898 
  1899     /* If nRec is 0xffffffff, then this journal was created by a process
  1900     ** working in no-sync mode. This means that the rest of the journal
  1901     ** file consists of pages, there are no more journal headers. Compute
  1902     ** the value of nRec based on this assumption.
  1903     */
  1904     if( nRec==0xffffffff ){
  1905       assert( pPager->journalOff==JOURNAL_HDR_SZ(pPager) );
  1906       nRec = (szJ - JOURNAL_HDR_SZ(pPager))/JOURNAL_PG_SZ(pPager);
  1907     }
  1908 
  1909     /* If nRec is 0 and this rollback is of a transaction created by this
  1910     ** process and if this is the final header in the journal, then it means
  1911     ** that this part of the journal was being filled but has not yet been
  1912     ** synced to disk.  Compute the number of pages based on the remaining
  1913     ** size of the file.
  1914     **
  1915     ** The third term of the test was added to fix ticket #2565.
  1916     */
  1917     if( nRec==0 && !isHot &&
  1918         pPager->journalHdr+JOURNAL_HDR_SZ(pPager)==pPager->journalOff ){
  1919       nRec = (szJ - pPager->journalOff) / JOURNAL_PG_SZ(pPager);
  1920     }
  1921 
  1922     /* If this is the first header read from the journal, truncate the
  1923     ** database file back to its original size.
  1924     */
  1925     if( pPager->journalOff==JOURNAL_HDR_SZ(pPager) ){
  1926       rc = pager_truncate(pPager, mxPg);
  1927       if( rc!=SQLITE_OK ){
  1928         goto end_playback;
  1929       }
  1930     }
  1931 
  1932     /* Copy original pages out of the journal and back into the database file.
  1933     */
  1934     for(u=0; u<nRec; u++){
  1935       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
  1936       if( rc!=SQLITE_OK ){
  1937         if( rc==SQLITE_DONE ){
  1938           rc = SQLITE_OK;
  1939           pPager->journalOff = szJ;
  1940           break;
  1941         }else{
  1942           goto end_playback;
  1943         }
  1944       }
  1945     }
  1946   }
  1947   /*NOTREACHED*/
  1948   assert( 0 );
  1949 
  1950 end_playback:
  1951   if( rc==SQLITE_OK ){
  1952     zMaster = pPager->pTmpSpace;
  1953     rc = readMasterJournal(pPager->jfd, zMaster, pPager->pVfs->mxPathname+1);
  1954   }
  1955   if( rc==SQLITE_OK ){
  1956     rc = pager_end_transaction(pPager, zMaster[0]!='\0');
  1957   }
  1958   if( rc==SQLITE_OK && zMaster[0] ){
  1959     /* If there was a master journal and this routine will return success,
  1960     ** see if it is possible to delete the master journal.
  1961     */
  1962     rc = pager_delmaster(pPager, zMaster);
  1963   }
  1964 
  1965   /* The Pager.sectorSize variable may have been updated while rolling
  1966   ** back a journal created by a process with a different sector size
  1967   ** value. Reset it to the correct value for this process.
  1968   */
  1969   setSectorSize(pPager);
  1970   return rc;
  1971 }
  1972 
  1973 /*
  1974 ** Playback the statement journal.
  1975 **
  1976 ** This is similar to playing back the transaction journal but with
  1977 ** a few extra twists.
  1978 **
  1979 **    (1)  The number of pages in the database file at the start of
  1980 **         the statement is stored in pPager->stmtSize, not in the
  1981 **         journal file itself.
  1982 **
  1983 **    (2)  In addition to playing back the statement journal, also
  1984 **         playback all pages of the transaction journal beginning
  1985 **         at offset pPager->stmtJSize.
  1986 */
  1987 static int pager_stmt_playback(Pager *pPager){
  1988   i64 szJ;                 /* Size of the full journal */
  1989   i64 hdrOff;
  1990   int nRec;                /* Number of Records */
  1991   int i;                   /* Loop counter */
  1992   int rc;
  1993 
  1994   szJ = pPager->journalOff;
  1995 
  1996   /* Set hdrOff to be the offset just after the end of the last journal
  1997   ** page written before the first journal-header for this statement
  1998   ** transaction was written, or the end of the file if no journal
  1999   ** header was written.
  2000   */
  2001   hdrOff = pPager->stmtHdrOff;
  2002   assert( pPager->fullSync || !hdrOff );
  2003   if( !hdrOff ){
  2004     hdrOff = szJ;
  2005   }
  2006   
  2007   /* Truncate the database back to its original size.
  2008   */
  2009   rc = pager_truncate(pPager, pPager->stmtSize);
  2010   assert( pPager->state>=PAGER_SHARED );
  2011 
  2012   /* Figure out how many records are in the statement journal.
  2013   */
  2014   assert( pPager->stmtInUse && pPager->journalOpen );
  2015   nRec = pPager->stmtNRec;
  2016   
  2017   /* Copy original pages out of the statement journal and back into the
  2018   ** database file.  Note that the statement journal omits checksums from
  2019   ** each record since power-failure recovery is not important to statement
  2020   ** journals.
  2021   */
  2022   for(i=0; i<nRec; i++){
  2023     i64 offset = i*(4+pPager->pageSize);
  2024     rc = pager_playback_one_page(pPager, pPager->stfd, offset, 0);
  2025     assert( rc!=SQLITE_DONE );
  2026     if( rc!=SQLITE_OK ) goto end_stmt_playback;
  2027   }
  2028 
  2029   /* Now roll some pages back from the transaction journal. Pager.stmtJSize
  2030   ** was the size of the journal file when this statement was started, so
  2031   ** everything after that needs to be rolled back, either into the
  2032   ** database, the memory cache, or both.
  2033   **
  2034   ** If it is not zero, then Pager.stmtHdrOff is the offset to the start
  2035   ** of the first journal header written during this statement transaction.
  2036   */
  2037   pPager->journalOff = pPager->stmtJSize;
  2038   pPager->cksumInit = pPager->stmtCksum;
  2039   while( pPager->journalOff < hdrOff ){
  2040     rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
  2041     assert( rc!=SQLITE_DONE );
  2042     if( rc!=SQLITE_OK ) goto end_stmt_playback;
  2043   }
  2044 
  2045   while( pPager->journalOff < szJ ){
  2046     u32 nJRec;         /* Number of Journal Records */
  2047     u32 dummy;
  2048     rc = readJournalHdr(pPager, szJ, &nJRec, &dummy);
  2049     if( rc!=SQLITE_OK ){
  2050       assert( rc!=SQLITE_DONE );
  2051       goto end_stmt_playback;
  2052     }
  2053     if( nJRec==0 ){
  2054       nJRec = (szJ - pPager->journalOff) / (pPager->pageSize+8);
  2055     }
  2056     for(i=nJRec-1; i>=0 && pPager->journalOff < szJ; i--){
  2057       rc = pager_playback_one_page(pPager, pPager->jfd, pPager->journalOff, 1);
  2058       assert( rc!=SQLITE_DONE );
  2059       if( rc!=SQLITE_OK ) goto end_stmt_playback;
  2060     }
  2061   }
  2062 
  2063   pPager->journalOff = szJ;
  2064   
  2065 end_stmt_playback:
  2066   if( rc==SQLITE_OK) {
  2067     pPager->journalOff = szJ;
  2068     /* pager_reload_cache(pPager); */
  2069   }
  2070   return rc;
  2071 }
  2072 
  2073 /*
  2074 ** Change the maximum number of in-memory pages that are allowed.
  2075 */
  2076 void sqlite3PagerSetCachesize(Pager *pPager, int mxPage){
  2077   if( mxPage>10 ){
  2078     pPager->mxPage = mxPage;
  2079   }else{
  2080     pPager->mxPage = 10;
  2081   }
  2082 }
  2083 
  2084 /*
  2085 ** Adjust the robustness of the database to damage due to OS crashes
  2086 ** or power failures by changing the number of syncs()s when writing
  2087 ** the rollback journal.  There are three levels:
  2088 **
  2089 **    OFF       sqlite3OsSync() is never called.  This is the default
  2090 **              for temporary and transient files.
  2091 **
  2092 **    NORMAL    The journal is synced once before writes begin on the
  2093 **              database.  This is normally adequate protection, but
  2094 **              it is theoretically possible, though very unlikely,
  2095 **              that an inopertune power failure could leave the journal
  2096 **              in a state which would cause damage to the database
  2097 **              when it is rolled back.
  2098 **
  2099 **    FULL      The journal is synced twice before writes begin on the
  2100 **              database (with some additional information - the nRec field
  2101 **              of the journal header - being written in between the two
  2102 **              syncs).  If we assume that writing a
  2103 **              single disk sector is atomic, then this mode provides
  2104 **              assurance that the journal will not be corrupted to the
  2105 **              point of causing damage to the database during rollback.
  2106 **
  2107 ** Numeric values associated with these states are OFF==1, NORMAL=2,
  2108 ** and FULL=3.
  2109 */
  2110 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  2111 void sqlite3PagerSetSafetyLevel(Pager *pPager, int level, int full_fsync){
  2112   pPager->noSync =  level==1 || pPager->tempFile;
  2113   pPager->fullSync = level==3 && !pPager->tempFile;
  2114   pPager->sync_flags = (full_fsync?SQLITE_SYNC_FULL:SQLITE_SYNC_NORMAL);
  2115   if( pPager->noSync ) pPager->needSync = 0;
  2116 }
  2117 #endif
  2118 
  2119 /*
  2120 ** The following global variable is incremented whenever the library
  2121 ** attempts to open a temporary file.  This information is used for
  2122 ** testing and analysis only.  
  2123 */
  2124 #ifdef SQLITE_TEST
  2125 int sqlite3_opentemp_count = 0;
  2126 #endif
  2127 
  2128 /*
  2129 ** Open a temporary file. 
  2130 **
  2131 ** Write the file descriptor into *fd.  Return SQLITE_OK on success or some
  2132 ** other error code if we fail. The OS will automatically delete the temporary
  2133 ** file when it is closed.
  2134 */
  2135 static int sqlite3PagerOpentemp(
  2136   Pager *pPager,        /* The pager object */
  2137   sqlite3_file *pFile,  /* Write the file descriptor here */
  2138   int vfsFlags          /* Flags passed through to the VFS */
  2139 ){
  2140   int rc;
  2141 
  2142 #ifdef SQLITE_TEST
  2143   sqlite3_opentemp_count++;  /* Used for testing and analysis only */
  2144 #endif
  2145 
  2146   vfsFlags |=  SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE |
  2147             SQLITE_OPEN_EXCLUSIVE | SQLITE_OPEN_DELETEONCLOSE;
  2148   rc = sqlite3OsOpen(pPager->pVfs, 0, pFile, vfsFlags, 0);
  2149   assert( rc!=SQLITE_OK || pFile->pMethods );
  2150   return rc;
  2151 }
  2152 
  2153 /*
  2154 ** Create a new page cache and put a pointer to the page cache in *ppPager.
  2155 ** The file to be cached need not exist.  The file is not locked until
  2156 ** the first call to sqlite3PagerGet() and is only held open until the
  2157 ** last page is released using sqlite3PagerUnref().
  2158 **
  2159 ** If zFilename is NULL then a randomly-named temporary file is created
  2160 ** and used as the file to be cached.  The file will be deleted
  2161 ** automatically when it is closed.
  2162 **
  2163 ** If zFilename is ":memory:" then all information is held in cache.
  2164 ** It is never written to disk.  This can be used to implement an
  2165 ** in-memory database.
  2166 */
  2167 int sqlite3PagerOpen(
  2168   sqlite3_vfs *pVfs,       /* The virtual file system to use */
  2169   Pager **ppPager,         /* Return the Pager structure here */
  2170   const char *zFilename,   /* Name of the database file to open */
  2171   int nExtra,              /* Extra bytes append to each in-memory page */
  2172   int flags,               /* flags controlling this file */
  2173   int vfsFlags             /* flags passed through to sqlite3_vfs.xOpen() */
  2174 ){
  2175   u8 *pPtr;
  2176   Pager *pPager = 0;
  2177   int rc = SQLITE_OK;
  2178   int i;
  2179   int tempFile = 0;
  2180   int memDb = 0;
  2181   int readOnly = 0;
  2182   int useJournal = (flags & PAGER_OMIT_JOURNAL)==0;
  2183   int noReadlock = (flags & PAGER_NO_READLOCK)!=0;
  2184   int journalFileSize = sqlite3JournalSize(pVfs);
  2185   int szPageDflt = SQLITE_DEFAULT_PAGE_SIZE;
  2186   char *zPathname = 0;
  2187   int nPathname = 0;
  2188 
  2189   /* The default return is a NULL pointer */
  2190   *ppPager = 0;
  2191 
  2192   /* Compute and store the full pathname in an allocated buffer pointed
  2193   ** to by zPathname, length nPathname. Or, if this is a temporary file,
  2194   ** leave both nPathname and zPathname set to 0.
  2195   */
  2196   if( zFilename && zFilename[0] ){
  2197     nPathname = pVfs->mxPathname+1;
  2198     zPathname = sqlite3Malloc(nPathname*2);
  2199     if( zPathname==0 ){
  2200       return SQLITE_NOMEM;
  2201     }
  2202 #ifndef SQLITE_OMIT_MEMORYDB
  2203     if( strcmp(zFilename,":memory:")==0 ){
  2204       memDb = 1;
  2205       zPathname[0] = 0;
  2206     }else
  2207 #endif
  2208     {
  2209       rc = sqlite3OsFullPathname(pVfs, zFilename, nPathname, zPathname);
  2210     }
  2211     if( rc!=SQLITE_OK ){
  2212       sqlite3_free(zPathname);
  2213       return rc;
  2214     }
  2215     nPathname = strlen(zPathname);
  2216   }
  2217 
  2218   /* Allocate memory for the pager structure */
  2219   pPager = sqlite3MallocZero(
  2220     sizeof(*pPager) +           /* Pager structure */
  2221     journalFileSize +           /* The journal file structure */ 
  2222     pVfs->szOsFile * 3 +        /* The main db and two journal files */ 
  2223     3*nPathname + 40            /* zFilename, zDirectory, zJournal */
  2224   );
  2225   if( !pPager ){
  2226     sqlite3_free(zPathname);
  2227     return SQLITE_NOMEM;
  2228   }
  2229   pPtr = (u8 *)&pPager[1];
  2230   pPager->vfsFlags = vfsFlags;
  2231   pPager->fd = (sqlite3_file*)&pPtr[pVfs->szOsFile*0];
  2232   pPager->stfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*1];
  2233   pPager->jfd = (sqlite3_file*)&pPtr[pVfs->szOsFile*2];
  2234   pPager->zFilename = (char*)&pPtr[pVfs->szOsFile*2+journalFileSize];
  2235   pPager->zDirectory = &pPager->zFilename[nPathname+1];
  2236   pPager->zJournal = &pPager->zDirectory[nPathname+1];
  2237   pPager->pVfs = pVfs;
  2238   if( zPathname ){
  2239     memcpy(pPager->zFilename, zPathname, nPathname+1);
  2240     sqlite3_free(zPathname);
  2241   }
  2242 
  2243   /* Open the pager file.
  2244   */
  2245   if( zFilename && zFilename[0] && !memDb ){
  2246     if( nPathname>(pVfs->mxPathname - sizeof("-journal")) ){
  2247       rc = SQLITE_CANTOPEN;
  2248     }else{
  2249       int fout = 0;
  2250       rc = sqlite3OsOpen(pVfs, pPager->zFilename, pPager->fd,
  2251                          pPager->vfsFlags, &fout);
  2252       readOnly = (fout&SQLITE_OPEN_READONLY);
  2253 
  2254       /* If the file was successfully opened for read/write access,
  2255       ** choose a default page size in case we have to create the
  2256       ** database file. The default page size is the maximum of:
  2257       **
  2258       **    + SQLITE_DEFAULT_PAGE_SIZE,
  2259       **    + The value returned by sqlite3OsSectorSize()
  2260       **    + The largest page size that can be written atomically.
  2261       */
  2262       if( rc==SQLITE_OK && !readOnly ){
  2263         int iSectorSize = sqlite3OsSectorSize(pPager->fd);
  2264         if( szPageDflt<iSectorSize ){
  2265           szPageDflt = iSectorSize;
  2266         }
  2267 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
  2268         {
  2269           int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
  2270           int ii;
  2271           assert(SQLITE_IOCAP_ATOMIC512==(512>>8));
  2272           assert(SQLITE_IOCAP_ATOMIC64K==(65536>>8));
  2273           assert(SQLITE_MAX_DEFAULT_PAGE_SIZE<=65536);
  2274           for(ii=szPageDflt; ii<=SQLITE_MAX_DEFAULT_PAGE_SIZE; ii=ii*2){
  2275             if( iDc&(SQLITE_IOCAP_ATOMIC|(ii>>8)) ) szPageDflt = ii;
  2276           }
  2277         }
  2278 #endif
  2279         if( szPageDflt>SQLITE_MAX_DEFAULT_PAGE_SIZE ){
  2280           szPageDflt = SQLITE_MAX_DEFAULT_PAGE_SIZE;
  2281         }
  2282       }
  2283     }
  2284   }else if( !memDb ){
  2285     /* If a temporary file is requested, it is not opened immediately.
  2286     ** In this case we accept the default page size and delay actually
  2287     ** opening the file until the first call to OsWrite().
  2288     */ 
  2289     tempFile = 1;
  2290     pPager->state = PAGER_EXCLUSIVE;
  2291   }
  2292 
  2293   if( pPager && rc==SQLITE_OK ){
  2294     pPager->pTmpSpace = sqlite3PageMalloc(szPageDflt);
  2295   }
  2296 
  2297   /* If an error occured in either of the blocks above.
  2298   ** Free the Pager structure and close the file.
  2299   ** Since the pager is not allocated there is no need to set 
  2300   ** any Pager.errMask variables.
  2301   */
  2302   if( !pPager || !pPager->pTmpSpace ){
  2303     sqlite3OsClose(pPager->fd);
  2304     sqlite3_free(pPager);
  2305     return ((rc==SQLITE_OK)?SQLITE_NOMEM:rc);
  2306   }
  2307 
  2308   PAGERTRACE3("OPEN %d %s\n", FILEHANDLEID(pPager->fd), pPager->zFilename);
  2309   IOTRACE(("OPEN %p %s\n", pPager, pPager->zFilename))
  2310 
  2311   /* Fill in Pager.zDirectory[] */
  2312   memcpy(pPager->zDirectory, pPager->zFilename, nPathname+1);
  2313   for(i=strlen(pPager->zDirectory); i>0 && pPager->zDirectory[i-1]!='/'; i--){}
  2314   if( i>0 ) pPager->zDirectory[i-1] = 0;
  2315 
  2316   /* Fill in Pager.zJournal[] */
  2317   if( zPathname ){
  2318     memcpy(pPager->zJournal, pPager->zFilename, nPathname);
  2319     memcpy(&pPager->zJournal[nPathname], "-journal", 9);
  2320   }else{
  2321     pPager->zJournal = 0;
  2322   }
  2323 
  2324   /* pPager->journalOpen = 0; */
  2325   pPager->useJournal = useJournal && !memDb;
  2326   pPager->noReadlock = noReadlock && readOnly;
  2327   /* pPager->stmtOpen = 0; */
  2328   /* pPager->stmtInUse = 0; */
  2329   /* pPager->nRef = 0; */
  2330   pPager->dbSize = memDb-1;
  2331   pPager->pageSize = szPageDflt;
  2332   /* pPager->stmtSize = 0; */
  2333   /* pPager->stmtJSize = 0; */
  2334   /* pPager->nPage = 0; */
  2335   pPager->mxPage = 100;
  2336   pPager->mxPgno = SQLITE_MAX_PAGE_COUNT;
  2337   /* pPager->state = PAGER_UNLOCK; */
  2338   assert( pPager->state == (tempFile ? PAGER_EXCLUSIVE : PAGER_UNLOCK) );
  2339   /* pPager->errMask = 0; */
  2340   pPager->tempFile = tempFile;
  2341   assert( tempFile==PAGER_LOCKINGMODE_NORMAL 
  2342           || tempFile==PAGER_LOCKINGMODE_EXCLUSIVE );
  2343   assert( PAGER_LOCKINGMODE_EXCLUSIVE==1 );
  2344   pPager->exclusiveMode = tempFile; 
  2345   pPager->memDb = memDb;
  2346   pPager->readOnly = readOnly;
  2347   /* pPager->needSync = 0; */
  2348   pPager->noSync = pPager->tempFile || !useJournal;
  2349   pPager->fullSync = (pPager->noSync?0:1);
  2350   pPager->sync_flags = SQLITE_SYNC_NORMAL;
  2351   /* pPager->pFirst = 0; */
  2352   /* pPager->pFirstSynced = 0; */
  2353   /* pPager->pLast = 0; */
  2354   pPager->nExtra = FORCE_ALIGNMENT(nExtra);
  2355   pPager->journalSizeLimit = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;
  2356   assert(pPager->fd->pMethods||memDb||tempFile);
  2357   if( !memDb ){
  2358     setSectorSize(pPager);
  2359   }
  2360   /* pPager->pBusyHandler = 0; */
  2361   /* memset(pPager->aHash, 0, sizeof(pPager->aHash)); */
  2362   *ppPager = pPager;
  2363 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  2364   pPager->iInUseMM = 0;
  2365   pPager->iInUseDB = 0;
  2366   if( !memDb ){
  2367 #ifndef SQLITE_MUTEX_NOOP
  2368     sqlite3_mutex *mutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MEM2);
  2369 #endif
  2370     sqlite3_mutex_enter(mutex);
  2371     pPager->pNext = sqlite3PagerList;
  2372     if( sqlite3PagerList ){
  2373       assert( sqlite3PagerList->pPrev==0 );
  2374       sqlite3PagerList->pPrev = pPager;
  2375     }
  2376     pPager->pPrev = 0;
  2377     sqlite3PagerList = pPager;
  2378     sqlite3_mutex_leave(mutex);
  2379   }
  2380 #endif
  2381   return SQLITE_OK;
  2382 }
  2383 
  2384 /*
  2385 ** Set the busy handler function.
  2386 */
  2387 void sqlite3PagerSetBusyhandler(Pager *pPager, BusyHandler *pBusyHandler){
  2388   pPager->pBusyHandler = pBusyHandler;
  2389 }
  2390 
  2391 /*
  2392 ** Set the destructor for this pager.  If not NULL, the destructor is called
  2393 ** when the reference count on each page reaches zero.  The destructor can
  2394 ** be used to clean up information in the extra segment appended to each page.
  2395 **
  2396 ** The destructor is not called as a result sqlite3PagerClose().  
  2397 ** Destructors are only called by sqlite3PagerUnref().
  2398 */
  2399 void sqlite3PagerSetDestructor(Pager *pPager, void (*xDesc)(DbPage*,int)){
  2400   pPager->xDestructor = xDesc;
  2401 }
  2402 
  2403 /*
  2404 ** Set the reinitializer for this pager.  If not NULL, the reinitializer
  2405 ** is called when the content of a page in cache is restored to its original
  2406 ** value as a result of a rollback.  The callback gives higher-level code
  2407 ** an opportunity to restore the EXTRA section to agree with the restored
  2408 ** page data.
  2409 */
  2410 void sqlite3PagerSetReiniter(Pager *pPager, void (*xReinit)(DbPage*,int)){
  2411   pPager->xReiniter = xReinit;
  2412 }
  2413 
  2414 /*
  2415 ** Set the page size to *pPageSize. If the suggest new page size is
  2416 ** inappropriate, then an alternative page size is set to that
  2417 ** value before returning.
  2418 */
  2419 int sqlite3PagerSetPagesize(Pager *pPager, u16 *pPageSize){
  2420   int rc = SQLITE_OK;
  2421   u16 pageSize = *pPageSize;
  2422   assert( pageSize==0 || (pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE) );
  2423   if( pageSize && pageSize!=pPager->pageSize 
  2424    && !pPager->memDb && pPager->nRef==0 
  2425   ){
  2426     char *pNew = (char *)sqlite3PageMalloc(pageSize);
  2427     if( !pNew ){
  2428       rc = SQLITE_NOMEM;
  2429     }else{
  2430       pagerEnter(pPager);
  2431       pager_reset(pPager);
  2432       pPager->pageSize = pageSize;
  2433       setSectorSize(pPager);
  2434       sqlite3PageFree(pPager->pTmpSpace);
  2435       pPager->pTmpSpace = pNew;
  2436       pagerLeave(pPager);
  2437     }
  2438   }
  2439   *pPageSize = pPager->pageSize;
  2440   return rc;
  2441 }
  2442 
  2443 /*
  2444 ** Return a pointer to the "temporary page" buffer held internally
  2445 ** by the pager.  This is a buffer that is big enough to hold the
  2446 ** entire content of a database page.  This buffer is used internally
  2447 ** during rollback and will be overwritten whenever a rollback
  2448 ** occurs.  But other modules are free to use it too, as long as
  2449 ** no rollbacks are happening.
  2450 */
  2451 void *sqlite3PagerTempSpace(Pager *pPager){
  2452   return pPager->pTmpSpace;
  2453 }
  2454 
  2455 /*
  2456 ** Attempt to set the maximum database page count if mxPage is positive. 
  2457 ** Make no changes if mxPage is zero or negative.  And never reduce the
  2458 ** maximum page count below the current size of the database.
  2459 **
  2460 ** Regardless of mxPage, return the current maximum page count.
  2461 */
  2462 int sqlite3PagerMaxPageCount(Pager *pPager, int mxPage){
  2463   if( mxPage>0 ){
  2464     pPager->mxPgno = mxPage;
  2465   }
  2466   sqlite3PagerPagecount(pPager, 0);
  2467   return pPager->mxPgno;
  2468 }
  2469 
/*
** The following set of routines are used to disable the simulated
** I/O error mechanism.  These routines are used to avoid simulated
** errors in places where we do not care about errors.
**
** disable_simulated_io_errors() saves the current value of
** sqlite3_io_error_pending in saved_cnt and then sets the counter to -1.
** enable_simulated_io_errors() restores the saved value.  There is a
** single save slot, so the calls do not nest: a second disable before
** the matching enable overwrites the saved value with -1.
**
** Unless -DSQLITE_TEST=1 is used, these routines are all no-ops
** and generate no code.
*/
#ifdef SQLITE_TEST
extern int sqlite3_io_error_pending;
extern int sqlite3_io_error_hit;
static int saved_cnt;   /* Value of sqlite3_io_error_pending while disabled */
void disable_simulated_io_errors(void){
  saved_cnt = sqlite3_io_error_pending;
  sqlite3_io_error_pending = -1;
}
void enable_simulated_io_errors(void){
  sqlite3_io_error_pending = saved_cnt;
}
#else
/* Non-test builds: both calls expand to nothing. */
# define disable_simulated_io_errors()
# define enable_simulated_io_errors()
#endif
  2493 
  2494 /*
  2495 ** Read the first N bytes from the beginning of the file into memory
  2496 ** that pDest points to. 
  2497 **
  2498 ** No error checking is done. The rational for this is that this function 
  2499 ** may be called even if the file does not exist or contain a header. In 
  2500 ** these cases sqlite3OsRead() will return an error, to which the correct 
  2501 ** response is to zero the memory at pDest and continue.  A real IO error 
  2502 ** will presumably recur and be picked up later (Todo: Think about this).
  2503 */
  2504 int sqlite3PagerReadFileheader(Pager *pPager, int N, unsigned char *pDest){
  2505   int rc = SQLITE_OK;
  2506   memset(pDest, 0, N);
  2507   assert(MEMDB||pPager->fd->pMethods||pPager->tempFile);
  2508   if( pPager->fd->pMethods ){
  2509     IOTRACE(("DBHDR %p 0 %d\n", pPager, N))
  2510     rc = sqlite3OsRead(pPager->fd, pDest, N, 0);
  2511     if( rc==SQLITE_IOERR_SHORT_READ ){
  2512       rc = SQLITE_OK;
  2513     }
  2514   }
  2515   return rc;
  2516 }
  2517 
  2518 /*
  2519 ** Return the total number of pages in the disk file associated with
  2520 ** pPager. 
  2521 **
  2522 ** If the PENDING_BYTE lies on the page directly after the end of the
  2523 ** file, then consider this page part of the file too. For example, if
  2524 ** PENDING_BYTE is byte 4096 (the first byte of page 5) and the size of the
  2525 ** file is 4096 bytes, 5 is returned instead of 4.
  2526 */
  2527 int sqlite3PagerPagecount(Pager *pPager, int *pnPage){
  2528   i64 n = 0;
  2529   int rc;
  2530   assert( pPager!=0 );
  2531   if( pPager->errCode ){
  2532     return pPager->errCode;
  2533   }
  2534   if( pPager->dbSize>=0 ){
  2535     n = pPager->dbSize;
  2536   } else {
  2537     assert(pPager->fd->pMethods||pPager->tempFile);
  2538     if( (pPager->fd->pMethods)
  2539      && (rc = sqlite3OsFileSize(pPager->fd, &n))!=SQLITE_OK ){
  2540       pPager->nRef++;
  2541       pager_error(pPager, rc);
  2542       pPager->nRef--;
  2543       return rc;
  2544     }
  2545     if( n>0 && n<pPager->pageSize ){
  2546       n = 1;
  2547     }else{
  2548       n /= pPager->pageSize;
  2549     }
  2550     if( pPager->state!=PAGER_UNLOCK ){
  2551       pPager->dbSize = n;
  2552     }
  2553   }
  2554   if( n==(PENDING_BYTE/pPager->pageSize) ){
  2555     n++;
  2556   }
  2557   if( n>pPager->mxPgno ){
  2558     pPager->mxPgno = n;
  2559   }
  2560   if( pnPage ){
  2561     *pnPage = n;
  2562   }
  2563   return SQLITE_OK;
  2564 }
  2565 
  2566 
  2567 #ifndef SQLITE_OMIT_MEMORYDB
  2568 /*
  2569 ** Clear a PgHistory block
  2570 */
  2571 static void clearHistory(PgHistory *pHist){
  2572   sqlite3PageFree(pHist->pOrig);
  2573   sqlite3PageFree(pHist->pStmt);
  2574   pHist->pOrig = 0;
  2575   pHist->pStmt = 0;
  2576 }
  2577 #else
  2578 #define clearHistory(x)
  2579 #endif
  2580 
/*
** Forward declaration.  syncJournal() is defined further down in this
** file but is called by sqlite3PagerTruncate() before its definition.
*/
static int syncJournal(Pager*);
  2585 
  2586 /*
  2587 ** Unlink pPg from its hash chain. Also set the page number to 0 to indicate
  2588 ** that the page is not part of any hash chain. This is required because the
  2589 ** sqlite3PagerMovepage() routine can leave a page in the 
  2590 ** pNextFree/pPrevFree list that is not a part of any hash-chain.
  2591 */
  2592 static void unlinkHashChain(Pager *pPager, PgHdr *pPg){
  2593   if( pPg->pgno==0 ){
  2594     assert( pPg->pNextHash==0 && pPg->pPrevHash==0 );
  2595     return;
  2596   }
  2597   if( pPg->pNextHash ){
  2598     pPg->pNextHash->pPrevHash = pPg->pPrevHash;
  2599   }
  2600   if( pPg->pPrevHash ){
  2601     assert( pPager->aHash[pPg->pgno & (pPager->nHash-1)]!=pPg );
  2602     pPg->pPrevHash->pNextHash = pPg->pNextHash;
  2603   }else{
  2604     int h = pPg->pgno & (pPager->nHash-1);
  2605     pPager->aHash[h] = pPg->pNextHash;
  2606   }
  2607   if( MEMDB ){
  2608     clearHistory(PGHDR_TO_HIST(pPg, pPager));
  2609   }
  2610   pPg->pgno = 0;
  2611   pPg->pNextHash = pPg->pPrevHash = 0;
  2612 }
  2613 
  2614 /*
  2615 ** Unlink a page from the free list (the list of all pages where nRef==0)
  2616 ** and from its hash collision chain.
  2617 */
  2618 static void unlinkPage(PgHdr *pPg){
  2619   Pager *pPager = pPg->pPager;
  2620 
  2621   /* Unlink from free page list */
  2622   lruListRemove(pPg);
  2623 
  2624   /* Unlink from the pgno hash table */
  2625   unlinkHashChain(pPager, pPg);
  2626 }
  2627 
  2628 /*
  2629 ** This routine is used to truncate the cache when a database
  2630 ** is truncated.  Drop from the cache all pages whose pgno is
  2631 ** larger than pPager->dbSize and is unreferenced.
  2632 **
  2633 ** Referenced pages larger than pPager->dbSize are zeroed.
  2634 **
  2635 ** Actually, at the point this routine is called, it would be
  2636 ** an error to have a referenced page.  But rather than delete
  2637 ** that page and guarantee a subsequent segfault, it seems better
  2638 ** to zero it and hope that we error out sanely.
  2639 */
  2640 static void pager_truncate_cache(Pager *pPager){
  2641   PgHdr *pPg;
  2642   PgHdr **ppPg;
  2643   int dbSize = pPager->dbSize;
  2644 
  2645   ppPg = &pPager->pAll;
  2646   while( (pPg = *ppPg)!=0 ){
  2647     if( pPg->pgno<=dbSize ){
  2648       ppPg = &pPg->pNextAll;
  2649     }else if( pPg->nRef>0 ){
  2650       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
  2651       ppPg = &pPg->pNextAll;
  2652     }else{
  2653       *ppPg = pPg->pNextAll;
  2654 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  2655       if( *ppPg ){
  2656         (*ppPg)->pPrevAll = pPg->pPrevAll;
  2657       }
  2658 #endif
  2659       IOTRACE(("PGFREE %p %d\n", pPager, pPg->pgno));
  2660       PAGER_INCR(sqlite3_pager_pgfree_count);
  2661       unlinkPage(pPg);
  2662       makeClean(pPg);
  2663       sqlite3PageFree(pPg->pData);
  2664       sqlite3_free(pPg);
  2665       pPager->nPage--;
  2666     }
  2667   }
  2668 }
  2669 
  2670 /*
  2671 ** Try to obtain a lock on a file.  Invoke the busy callback if the lock
  2672 ** is currently not available.  Repeat until the busy callback returns
  2673 ** false or until the lock succeeds.
  2674 **
  2675 ** Return SQLITE_OK on success and an error code if we cannot obtain
  2676 ** the lock.
  2677 */
  2678 static int pager_wait_on_lock(Pager *pPager, int locktype){
  2679   int rc;
  2680 
  2681   /* The OS lock values must be the same as the Pager lock values */
  2682   assert( PAGER_SHARED==SHARED_LOCK );
  2683   assert( PAGER_RESERVED==RESERVED_LOCK );
  2684   assert( PAGER_EXCLUSIVE==EXCLUSIVE_LOCK );
  2685 
  2686   /* If the file is currently unlocked then the size must be unknown */
  2687   assert( pPager->state>=PAGER_SHARED || pPager->dbSize<0 || MEMDB );
  2688 
  2689   if( pPager->state>=locktype ){
  2690     rc = SQLITE_OK;
  2691   }else{
  2692     if( pPager->pBusyHandler ) pPager->pBusyHandler->nBusy = 0;
  2693     do {
  2694       rc = sqlite3OsLock(pPager->fd, locktype);
  2695     }while( rc==SQLITE_BUSY && sqlite3InvokeBusyHandler(pPager->pBusyHandler) );
  2696     if( rc==SQLITE_OK ){
  2697       pPager->state = locktype;
  2698       IOTRACE(("LOCK %p %d\n", pPager, locktype))
  2699     }
  2700   }
  2701   return rc;
  2702 }
  2703 
  2704 /*
  2705 ** Truncate the file to the number of pages specified.
  2706 */
  2707 int sqlite3PagerTruncate(Pager *pPager, Pgno nPage){
  2708   int rc;
  2709   assert( pPager->state>=PAGER_SHARED || MEMDB );
  2710   sqlite3PagerPagecount(pPager, 0);
  2711   if( pPager->errCode ){
  2712     rc = pPager->errCode;
  2713     return rc;
  2714   }
  2715   if( nPage>=(unsigned)pPager->dbSize ){
  2716     return SQLITE_OK;
  2717   }
  2718   if( MEMDB ){
  2719     pPager->dbSize = nPage;
  2720     pager_truncate_cache(pPager);
  2721     return SQLITE_OK;
  2722   }
  2723   pagerEnter(pPager);
  2724   rc = syncJournal(pPager);
  2725   pagerLeave(pPager);
  2726   if( rc!=SQLITE_OK ){
  2727     return rc;
  2728   }
  2729 
  2730   /* Get an exclusive lock on the database before truncating. */
  2731   pagerEnter(pPager);
  2732   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  2733   pagerLeave(pPager);
  2734   if( rc!=SQLITE_OK ){
  2735     return rc;
  2736   }
  2737 
  2738   rc = pager_truncate(pPager, nPage);
  2739   return rc;
  2740 }
  2741 
/*
** Shutdown the page cache.  Free all memory and close all files.
**
** If a transaction was in progress when this routine is called, that
** transaction is rolled back.  All outstanding pages are invalidated
** and their memory is freed.  Any attempt to use a page associated
** with this page cache after this function returns will likely
** result in a coredump.
**
** This function always succeeds (it unconditionally returns SQLITE_OK).
** If a transaction is active an attempt is made to roll it back. If an
** error occurs during the rollback a hot journal may be left in the
** filesystem but no error is returned to the caller.
**
** pPager itself is freed; the pointer must not be used afterwards.
*/
int sqlite3PagerClose(Pager *pPager){
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  /* Remove this pager from the global doubly-linked sqlite3PagerList
  ** while holding the STATIC_MEM2 mutex.  In-memory pagers are not
  ** unlinked here. */
  if( !MEMDB ){
#ifndef SQLITE_MUTEX_NOOP
    sqlite3_mutex *mutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MEM2);
#endif
    sqlite3_mutex_enter(mutex);
    if( pPager->pPrev ){
      pPager->pPrev->pNext = pPager->pNext;
    }else{
      sqlite3PagerList = pPager->pNext;
    }
    if( pPager->pNext ){
      pPager->pNext->pPrev = pPager->pPrev;
    }
    sqlite3_mutex_leave(mutex);
  }
#endif

  /* Simulated I/O errors and malloc failures are suppressed while the
  ** cache is reset and the transaction (if any) is rolled back, since
  ** errors cannot be reported from here. */
  disable_simulated_io_errors();
  sqlite3BeginBenignMalloc();
  /* NOTE(review): errCode and exclusiveMode are cleared first, presumably
  ** so that the reset/unlock below is not short-circuited by a sticky
  ** error or by exclusive locking mode -- confirm against
  ** pagerUnlockAndRollback(). */
  pPager->errCode = 0;
  pPager->exclusiveMode = 0;
  pager_reset(pPager);
  pagerUnlockAndRollback(pPager);
  enable_simulated_io_errors();
  sqlite3EndBenignMalloc();
  PAGERTRACE2("CLOSE %d\n", PAGERID(pPager));
  IOTRACE(("CLOSE %p\n", pPager))
  /* Close whichever of the journal, statement journal and database
  ** files are open. */
  if( pPager->journalOpen ){
    sqlite3OsClose(pPager->jfd);
  }
  sqlite3BitvecDestroy(pPager->pInJournal);
  if( pPager->stmtOpen ){
    sqlite3OsClose(pPager->stfd);
  }
  sqlite3OsClose(pPager->fd);
  /* Temp files are automatically deleted by the OS
  ** if( pPager->tempFile ){
  **   sqlite3OsDelete(pPager->zFilename);
  ** }
  */

  /* Release the hash table, the temporary-page buffer and finally the
  ** Pager allocation itself. */
  sqlite3_free(pPager->aHash);
  sqlite3PageFree(pPager->pTmpSpace);
  sqlite3_free(pPager);
  return SQLITE_OK;
}
  2804 
  2805 #if !defined(NDEBUG) || defined(SQLITE_TEST)
  2806 /*
  2807 ** Return the page number for the given page data.
  2808 */
  2809 Pgno sqlite3PagerPagenumber(DbPage *p){
  2810   return p->pgno;
  2811 }
  2812 #endif
  2813 
  2814 /*
  2815 ** The page_ref() function increments the reference count for a page.
  2816 ** If the page is currently on the freelist (the reference count is zero) then
  2817 ** remove it from the freelist.
  2818 **
  2819 ** For non-test systems, page_ref() is a macro that calls _page_ref()
  2820 ** online of the reference count is zero.  For test systems, page_ref()
  2821 ** is a real function so that we can set breakpoints and trace it.
  2822 */
  2823 static void _page_ref(PgHdr *pPg){
  2824   if( pPg->nRef==0 ){
  2825     /* The page is currently on the freelist.  Remove it. */
  2826     lruListRemove(pPg);
  2827     pPg->pPager->nRef++;
  2828   }
  2829   pPg->nRef++;
  2830 }
  2831 #ifdef SQLITE_DEBUG
  2832   static void page_ref(PgHdr *pPg){
  2833     if( pPg->nRef==0 ){
  2834       _page_ref(pPg);
  2835     }else{
  2836       pPg->nRef++;
  2837     }
  2838   }
  2839 #else
  2840 # define page_ref(P)   ((P)->nRef==0?_page_ref(P):(void)(P)->nRef++)
  2841 #endif
  2842 
  2843 /*
  2844 ** Increment the reference count for a page.  The input pointer is
  2845 ** a reference to the page data.
  2846 */
  2847 int sqlite3PagerRef(DbPage *pPg){
  2848   pagerEnter(pPg->pPager);
  2849   page_ref(pPg);
  2850   pagerLeave(pPg->pPager);
  2851   return SQLITE_OK;
  2852 }
  2853 
/*
** Sync the journal.  In other words, make sure all the pages that have
** been written to the journal have actually reached the surface of the
** disk.  It is not safe to modify the original database file until after
** the journal has been synced.  If the original database is modified before
** the journal is synced and a power failure occurs, the unsynced journal
** data would be lost and we would be unable to completely rollback the
** database changes.  Database corruption would occur.
** 
** This routine also updates the nRec field in the header of the journal.
** (See comments on the pager_playback() routine for additional information.)
** If the sync mode is FULL, two syncs will occur.  First the whole journal
** is synced, then the nRec field is updated, then a second sync occurs.
**
** For temporary databases, we do not care if we are able to rollback
** after a power failure, so no sync occurs.
**
** If the IOCAP_SEQUENTIAL flag is set for the persistent media on which
** the database is stored, then OsSync() is never called on the journal
** file. In this case all that is required is to update the nRec field in
** the journal header.
**
** This routine clears the needSync field of every page currently held in
** memory.
**
** Returns SQLITE_OK on success, or the first error reported by
** sqlite3OsSync() or write32bits().  On error the function returns
** immediately, leaving Pager.needSync and the per-page needSync flags
** unchanged.
*/
static int syncJournal(Pager *pPager){
  PgHdr *pPg;
  int rc = SQLITE_OK;

  /* Sync the journal before modifying the main database
  ** (assuming there is a journal and it needs to be synced.)
  */
  if( pPager->needSync ){
    if( !pPager->tempFile ){
      /* Device characteristics are queried on the database file, not
      ** on the journal file. */
      int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
      assert( pPager->journalOpen );

      if( 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
        /* Write the nRec value into the journal file header. If in
        ** full-synchronous mode, sync the journal first. This ensures that
        ** all data has really hit the disk before nRec is updated to mark
        ** it as a candidate for rollback.
        **
        ** This is not required if the persistent media supports the
        ** SAFE_APPEND property. Because in this case it is not possible 
        ** for garbage data to be appended to the file, the nRec field
        ** is populated with 0xFFFFFFFF when the journal header is written
        ** and never needs to be updated.
        */
        i64 jrnlOff;
        if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
          PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
          IOTRACE(("JSYNC %p\n", pPager))
          rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags);
          if( rc!=0 ) return rc;
        }

        /* The nRec field is located directly after the journal magic
        ** string at the start of the current journal header. */
        jrnlOff = pPager->journalHdr + sizeof(aJournalMagic);
        IOTRACE(("JHDR %p %lld %d\n", pPager, jrnlOff, 4));
        rc = write32bits(pPager->jfd, jrnlOff, pPager->nRec);
        if( rc ) return rc;
      }
      if( 0==(iDc&SQLITE_IOCAP_SEQUENTIAL) ){
        PAGERTRACE2("SYNC journal of %d\n", PAGERID(pPager));
        IOTRACE(("JSYNC %p\n", pPager))
        /* When the sync flags are exactly SQLITE_SYNC_FULL the
        ** SQLITE_SYNC_DATAONLY bit is added for this sync. */
        rc = sqlite3OsSync(pPager->jfd, pPager->sync_flags| 
          (pPager->sync_flags==SQLITE_SYNC_FULL?SQLITE_SYNC_DATAONLY:0)
        );
        if( rc!=0 ) return rc;
      }
      pPager->journalStarted = 1;
    }
    pPager->needSync = 0;

    /* Erase the needSync flag from every page.
    */
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      pPg->needSync = 0;
    }
    lruListSetFirstSynced(pPager);
  }

#ifndef NDEBUG
  /* If the Pager.needSync flag is clear then the PgHdr.needSync
  ** flag must also be clear for all pages.  Verify that this
  ** invariant is true.
  **
  ** Note that the "else" below pairs with the "if( pPager->needSync )"
  ** above and only exists in builds where NDEBUG is not defined.
  */
  else{
    for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
      assert( pPg->needSync==0 );
    }
    assert( pPager->lru.pFirstSynced==pPager->lru.pFirst );
  }
#endif

  return rc;
}
  2951 
  2952 /*
  2953 ** Merge two lists of pages connected by pDirty and in pgno order.
  2954 ** Do not both fixing the pPrevDirty pointers.
  2955 */
  2956 static PgHdr *merge_pagelist(PgHdr *pA, PgHdr *pB){
  2957   PgHdr result, *pTail;
  2958   pTail = &result;
  2959   while( pA && pB ){
  2960     if( pA->pgno<pB->pgno ){
  2961       pTail->pDirty = pA;
  2962       pTail = pA;
  2963       pA = pA->pDirty;
  2964     }else{
  2965       pTail->pDirty = pB;
  2966       pTail = pB;
  2967       pB = pB->pDirty;
  2968     }
  2969   }
  2970   if( pA ){
  2971     pTail->pDirty = pA;
  2972   }else if( pB ){
  2973     pTail->pDirty = pB;
  2974   }else{
  2975     pTail->pDirty = 0;
  2976   }
  2977   return result.pDirty;
  2978 }
  2979 
  2980 /*
  2981 ** Sort the list of pages in accending order by pgno.  Pages are
  2982 ** connected by pDirty pointers.  The pPrevDirty pointers are
  2983 ** corrupted by this sort.
  2984 */
  2985 #define N_SORT_BUCKET_ALLOC 25
  2986 #define N_SORT_BUCKET       25
  2987 #ifdef SQLITE_TEST
  2988   int sqlite3_pager_n_sort_bucket = 0;
  2989   #undef N_SORT_BUCKET
  2990   #define N_SORT_BUCKET \
  2991    (sqlite3_pager_n_sort_bucket?sqlite3_pager_n_sort_bucket:N_SORT_BUCKET_ALLOC)
  2992 #endif
  2993 static PgHdr *sort_pagelist(PgHdr *pIn){
  2994   PgHdr *a[N_SORT_BUCKET_ALLOC], *p;
  2995   int i;
  2996   memset(a, 0, sizeof(a));
  2997   while( pIn ){
  2998     p = pIn;
  2999     pIn = p->pDirty;
  3000     p->pDirty = 0;
  3001     for(i=0; i<N_SORT_BUCKET-1; i++){
  3002       if( a[i]==0 ){
  3003         a[i] = p;
  3004         break;
  3005       }else{
  3006         p = merge_pagelist(a[i], p);
  3007         a[i] = 0;
  3008       }
  3009     }
  3010     if( i==N_SORT_BUCKET-1 ){
  3011       /* Coverage: To get here, there need to be 2^(N_SORT_BUCKET) 
  3012       ** elements in the input list. This is possible, but impractical.
  3013       ** Testing this line is the point of global variable
  3014       ** sqlite3_pager_n_sort_bucket.
  3015       */
  3016       a[i] = merge_pagelist(a[i], p);
  3017     }
  3018   }
  3019   p = a[0];
  3020   for(i=1; i<N_SORT_BUCKET; i++){
  3021     p = merge_pagelist(p, a[i]);
  3022   }
  3023   return p;
  3024 }
  3025 
  3026 /*
  3027 ** Given a list of pages (connected by the PgHdr.pDirty pointer) write
  3028 ** every one of those pages out to the database file and mark them all
  3029 ** as clean.
  3030 */
  3031 static int pager_write_pagelist(PgHdr *pList){
  3032   Pager *pPager;
  3033   PgHdr *p;
  3034   int rc;
  3035 
  3036   if( pList==0 ) return SQLITE_OK;
  3037   pPager = pList->pPager;
  3038 
  3039   /* At this point there may be either a RESERVED or EXCLUSIVE lock on the
  3040   ** database file. If there is already an EXCLUSIVE lock, the following
  3041   ** calls to sqlite3OsLock() are no-ops.
  3042   **
  3043   ** Moving the lock from RESERVED to EXCLUSIVE actually involves going
  3044   ** through an intermediate state PENDING.   A PENDING lock prevents new
  3045   ** readers from attaching to the database but is unsufficient for us to
  3046   ** write.  The idea of a PENDING lock is to prevent new readers from
  3047   ** coming in while we wait for existing readers to clear.
  3048   **
  3049   ** While the pager is in the RESERVED state, the original database file
  3050   ** is unchanged and we can rollback without having to playback the
  3051   ** journal into the original database file.  Once we transition to
  3052   ** EXCLUSIVE, it means the database file has been changed and any rollback
  3053   ** will require a journal playback.
  3054   */
  3055   rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  3056   if( rc!=SQLITE_OK ){
  3057     return rc;
  3058   }
  3059 
  3060   pList = sort_pagelist(pList);
  3061   for(p=pList; p; p=p->pDirty){
  3062     assert( p->dirty );
  3063     p->dirty = 0;
  3064   }
  3065 
  3066   /* If the file has not yet been opened, open it now. */
  3067   if( !pPager->fd->pMethods ){
  3068     assert(pPager->tempFile);
  3069 	rc = sqlite3PagerOpentemp(pPager, pPager->fd, pPager->vfsFlags);
  3070 	if( rc ) return rc;
  3071   }
  3072 
  3073   while( pList ){
  3074     /* If there are dirty pages in the page cache with page numbers greater
  3075     ** than Pager.dbSize, this means sqlite3PagerTruncate() was called to
  3076     ** make the file smaller (presumably by auto-vacuum code). Do not write
  3077     ** any such pages to the file.
  3078     */
  3079     if( pList->pgno<=pPager->dbSize ){
  3080       i64 offset = (pList->pgno-1)*(i64)pPager->pageSize;
  3081       char *pData = CODEC2(pPager, PGHDR_TO_DATA(pList), pList->pgno, 6);
  3082       PAGERTRACE4("STORE %d page %d hash(%08x)\n",
  3083                    PAGERID(pPager), pList->pgno, pager_pagehash(pList));
  3084       IOTRACE(("PGOUT %p %d\n", pPager, pList->pgno));
  3085       rc = sqlite3OsWrite(pPager->fd, pData, pPager->pageSize, offset);
  3086       PAGER_INCR(sqlite3_pager_writedb_count);
  3087       PAGER_INCR(pPager->nWrite);
  3088       if( pList->pgno==1 ){
  3089         memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
  3090       }
  3091     }
  3092 #ifndef NDEBUG
  3093     else{
  3094       PAGERTRACE3("NOSTORE %d page %d\n", PAGERID(pPager), pList->pgno);
  3095     }
  3096 #endif
  3097     if( rc ) return rc;
  3098 #ifdef SQLITE_CHECK_PAGES
  3099     pList->pageHash = pager_pagehash(pList);
  3100 #endif
  3101     pList = pList->pDirty;
  3102   }
  3103   return SQLITE_OK;
  3104 }
  3105 
  3106 /*
  3107 ** Collect every dirty page into a dirty list and
  3108 ** return a pointer to the head of that list.  All pages are
  3109 ** collected even if they are still in use.
  3110 */
  3111 static PgHdr *pager_get_all_dirty_pages(Pager *pPager){
  3112 
  3113 #ifndef NDEBUG
  3114   /* Verify the sanity of the dirty list when we are running
  3115   ** in debugging mode.  This is expensive, so do not
  3116   ** do this on a normal build. */
  3117   int n1 = 0;
  3118   int n2 = 0;
  3119   PgHdr *p;
  3120   for(p=pPager->pAll; p; p=p->pNextAll){ if( p->dirty ) n1++; }
  3121   for(p=pPager->pDirty; p; p=p->pDirty){ n2++; }
  3122   assert( n1==n2 );
  3123 #endif
  3124 
  3125   return pPager->pDirty;
  3126 }
  3127 
  3128 /*
  3129 ** Return 1 if there is a hot journal on the given pager.
  3130 ** A hot journal is one that needs to be played back.
  3131 **
  3132 ** If the current size of the database file is 0 but a journal file
  3133 ** exists, that is probably an old journal left over from a prior
  3134 ** database with the same name.  Just delete the journal.
  3135 **
  3136 ** Return negative if unable to determine the status of the journal.
  3137 **
  3138 ** This routine does not open the journal file to examine its
  3139 ** content.  Hence, the journal might contain the name of a master
  3140 ** journal file that has been deleted, and hence not be hot.  Or
  3141 ** the header of the journal might be zeroed out.  This routine
  3142 ** does not discover these cases of a non-hot journal - if the
  3143 ** journal file exists and is not empty this routine assumes it
  3144 ** is hot.  The pager_playback() routine will discover that the
  3145 ** journal file is not really hot and will no-op.
  3146 */
  3147 static int hasHotJournal(Pager *pPager, int *pExists){
  3148   sqlite3_vfs *pVfs = pPager->pVfs;
  3149   int rc = SQLITE_OK;
  3150   *pExists = 0;
  3151   if( pPager->useJournal && pPager->fd->pMethods ){
  3152     int exists;
  3153     int locked;
  3154 
  3155     rc = sqlite3OsAccess(pVfs, pPager->zJournal, SQLITE_ACCESS_EXISTS, &exists);
  3156     if( rc==SQLITE_OK && exists ){
  3157       rc = sqlite3OsCheckReservedLock(pPager->fd, &locked);
  3158     }
  3159 
  3160     if( rc==SQLITE_OK && exists && !locked ){
  3161       int nPage;
  3162       rc = sqlite3PagerPagecount(pPager, &nPage);
  3163       if( rc==SQLITE_OK ){
  3164         if( nPage==0 ){
  3165           sqlite3OsDelete(pVfs, pPager->zJournal, 0);
  3166         }else{
  3167           *pExists = 1;
  3168         }
  3169       }
  3170     }
  3171   }
  3172 
  3173   return rc;
  3174 }
  3175 
/*
** Try to find a page in the cache that can be recycled. 
**
** On success, SQLITE_OK is returned and *ppPg points to a page that has
** been unlinked from the free list and the hash table (its pgno is 0).
** On failure *ppPg is left set to 0 and the error code of the failing
** journal sync, journal header write or page write is returned.
**
** This routine may return SQLITE_IOERR, SQLITE_FULL or SQLITE_OK; the
** callers in this file are also prepared to see SQLITE_BUSY.  It 
** does not set the pPager->errCode variable.
*/
static int pager_recycle(Pager *pPager, PgHdr **ppPg){
  PgHdr *pPg;
  *ppPg = 0;

  /* It is illegal to call this function unless the pager object
  ** pointed to by pPager has at least one free page (page with nRef==0).
  */ 
  assert(!MEMDB);
  assert(pPager->lru.pFirst);

  /* Find a page to recycle.  Try to locate a page that does not
  ** require us to do an fsync() on the journal.
  */
  pPg = pPager->lru.pFirstSynced;

  /* If we could not find a page that does not require an fsync()
  ** on the journal file then fsync the journal file.  This is a
  ** very slow operation, so we work hard to avoid it.  But sometimes
  ** it can't be helped.
  **
  ** If the pager is already in an error state the sync is skipped and
  ** the first page on the LRU list is taken as-is.
  */
  if( pPg==0 && pPager->lru.pFirst ){
    if( !pPager->errCode ){
      int iDc = sqlite3OsDeviceCharacteristics(pPager->fd);
      int rc = syncJournal(pPager);
      if( rc!=0 ){
        return rc;
      }
      if( pPager->fullSync && 0==(iDc&SQLITE_IOCAP_SAFE_APPEND) ){
        /* If in full-sync mode, write a new journal header into the
        ** journal file. This is done to avoid ever modifying a journal
        ** header that is involved in the rollback of pages that have
        ** already been written to the database (in case the header is
        ** trashed when the nRec field is updated).
        */
        pPager->nRec = 0;
        assert( pPager->journalOff > 0 );
        assert( pPager->doNotSync==0 );
        rc = writeJournalHdr(pPager);
        if( rc!=0 ){
          return rc;
        }
      }
    }
    pPg = pPager->lru.pFirst;
  }

  assert( pPg->nRef==0 );

  /* Write the page to the database file if it is dirty.
  **
  ** makeClean() is called first, then the dirty flag is set again and
  ** pDirty cleared so that pPg forms a one-element list that satisfies
  ** the assert( p->dirty ) inside pager_write_pagelist().  The dirty
  ** flag is cleared afterwards whether or not the write succeeded.
  */
  if( pPg->dirty && !pPager->errCode ){
    int rc;
    assert( pPg->needSync==0 );
    makeClean(pPg);
    pPg->dirty = 1;
    pPg->pDirty = 0;
    rc = pager_write_pagelist( pPg );
    pPg->dirty = 0;
    if( rc!=SQLITE_OK ){
      return rc;
    }
  }
  assert( pPg->dirty==0 || pPager->errCode );

  /* If the page we are recycling is marked as alwaysRollback, then
  ** set the global alwaysRollback flag, thus disabling the
  ** sqlite3PagerDontRollback() optimization for the rest of this transaction.
  ** It is necessary to do this because the page marked alwaysRollback
  ** might be reloaded at a later time but at that point we won't remember
  ** that it was marked alwaysRollback.  This means that all pages must
  ** be marked as alwaysRollback from here on out.
  */
  if( pPg->alwaysRollback ){
    IOTRACE(("ALWAYS_ROLLBACK %p\n", pPager))
    pPager->alwaysRollback = 1;
  }

  /* Unlink the old page from the free list and the hash table
  */
  unlinkPage(pPg);
  assert( pPg->pgno==0 );

  *ppPg = pPg;
  return SQLITE_OK;
}
  3267 
  3268 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** This function is called to free superfluous dynamically allocated memory
** held by the pager system. Memory in use by any SQLite pager allocated
** by the current thread may be sqlite3_free()ed.
**
** nReq is the number of bytes of memory required. Once this much has
** been released, the function returns. If nReq is negative there is no
** limit and pages are released until no candidate page is left or an
** error occurs. The return value is the total number of bytes of memory
** released.
**
** Errors hit while recycling a page are not returned to the caller;
** they are recorded on the affected pager with pager_error() and end
** the release loop.
*/
int sqlite3PagerReleaseMemory(int nReq){
  int nReleased = 0;          /* Bytes of memory released so far */
  Pager *pPager;              /* For looping over pagers */
  BusyHandler *savedBusy;     /* Saved copy of the busy handler */
  int rc = SQLITE_OK;

  /* Acquire the memory-management mutex
  */
#ifndef SQLITE_MUTEX_NOOP
  sqlite3_mutex *mutex;       /* The MEM2 mutex */
  mutex = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MEM2);
#endif
  sqlite3_mutex_enter(mutex);

  /* Signal all database connections that memory management wants
  ** to have access to the pagers.
  */
  for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
     pPager->iInUseMM = 1;
  }

  /* Release one page per iteration until enough memory has been freed,
  ** no candidate page remains, or recycling a page fails.
  */
  while( rc==SQLITE_OK && (nReq<0 || nReleased<nReq) ){
    PgHdr *pPg;
    PgHdr *pRecycled;
 
    /* Try to find a page to recycle that does not require a sync(). If
    ** this is not possible, find one that does require a sync().
    ** Pages belonging to a pager whose iInUseDB flag is set are skipped.
    */
    sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));
    pPg = sqlite3LruPageList.pFirstSynced;
    while( pPg && (pPg->needSync || pPg->pPager->iInUseDB) ){
      pPg = pPg->gfree.pNext;
    }
    if( !pPg ){
      pPg = sqlite3LruPageList.pFirst;
      while( pPg && pPg->pPager->iInUseDB ){
        pPg = pPg->gfree.pNext;
      }
    }
    sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_LRU));

    /* If pPg==0, then the block above has failed to find a page to
    ** recycle. In this case return early - no further memory will
    ** be released.
    */
    if( !pPg ) break;

    pPager = pPg->pPager;
    assert(!pPg->needSync || pPg==pPager->lru.pFirst);
    assert(pPg->needSync || pPg==pPager->lru.pFirstSynced);
  
    /* The busy handler is removed for the duration of the recycle and
    ** restored immediately afterwards.
    */
    savedBusy = pPager->pBusyHandler;
    pPager->pBusyHandler = 0;
    rc = pager_recycle(pPager, &pRecycled);
    pPager->pBusyHandler = savedBusy;
    assert(pRecycled==pPg || rc!=SQLITE_OK);
    if( rc==SQLITE_OK ){
      /* We've found a page to free. At this point the page has been 
      ** removed from the page hash-table, free-list and synced-list 
      ** (pFirstSynced). It is still in the all pages (pAll) list. 
      ** Remove it from this list before freeing.
      **
      ** Todo: Check the Pager.pStmt list to make sure this is Ok. It 
      ** probably is though.
      */
      PgHdr *pTmp;
      assert( pPg );
      if( pPg==pPager->pAll ){
         /* pPg is the head of the doubly-linked pAll list */
         assert(pPg->pPrevAll==0);
         assert(pPg->pNextAll==0 || pPg->pNextAll->pPrevAll==pPg);
         pPager->pAll = pPg->pNextAll;
         if( pPager->pAll ){
           pPager->pAll->pPrevAll = 0;
         }
      }else{
         /* pPg has a predecessor; splice it out of the list */
         assert(pPg->pPrevAll);
         assert(pPg->pPrevAll->pNextAll==pPg);
         pTmp = pPg->pPrevAll;
         pTmp->pNextAll = pPg->pNextAll;
         if( pTmp->pNextAll ){
           pTmp->pNextAll->pPrevAll = pTmp;
         }
      }
      /* Account for the header allocation and the page buffer, using
      ** the same size formula as pagerAllocatePage().
      */
      nReleased += (
          sizeof(*pPg) + pPager->pageSize
          + sizeof(u32) + pPager->nExtra
          + MEMDB*sizeof(PgHistory) 
      );
      IOTRACE(("PGFREE %p %d *\n", pPager, pPg->pgno));
      PAGER_INCR(sqlite3_pager_pgfree_count);
      sqlite3PageFree(pPg->pData);
      sqlite3_free(pPg);
      pPager->nPage--;
    }else{
      /* An error occurred whilst writing to the database file or 
      ** journal in pager_recycle(). The error is not returned to the 
      ** caller of this function. Instead, set the Pager.errCode variable.
      ** The error will be returned to the user (or users, in the case 
      ** of a shared pager cache) of the pager for which the error occurred.
      */
      assert(
          (rc&0xff)==SQLITE_IOERR ||
          rc==SQLITE_FULL ||
          rc==SQLITE_BUSY
      );
      assert( pPager->state>=PAGER_RESERVED );
      pager_error(pPager, rc);
    }
  }

  /* Clear the memory management flags and release the mutex
  */
  for(pPager=sqlite3PagerList; pPager; pPager=pPager->pNext){
     pPager->iInUseMM = 0;
  }
  sqlite3_mutex_leave(mutex);

  /* Return the number of bytes released
  */
  return nReleased;
}
  3399 #endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
  3400 
  3401 /*
  3402 ** Read the content of page pPg out of the database file.
  3403 */
  3404 static int readDbPage(Pager *pPager, PgHdr *pPg, Pgno pgno){
  3405   int rc;
  3406   i64 offset;
  3407   assert( MEMDB==0 );
  3408   assert(pPager->fd->pMethods||pPager->tempFile);
  3409   if( !pPager->fd->pMethods ){
  3410     return SQLITE_IOERR_SHORT_READ;
  3411   }
  3412   offset = (pgno-1)*(i64)pPager->pageSize;
  3413   rc = sqlite3OsRead(pPager->fd, PGHDR_TO_DATA(pPg), pPager->pageSize, offset);
  3414   PAGER_INCR(sqlite3_pager_readdb_count);
  3415   PAGER_INCR(pPager->nRead);
  3416   IOTRACE(("PGIN %p %d\n", pPager, pgno));
  3417   if( pgno==1 ){
  3418     memcpy(&pPager->dbFileVers, &((u8*)PGHDR_TO_DATA(pPg))[24],
  3419                                               sizeof(pPager->dbFileVers));
  3420   }
  3421   CODEC1(pPager, PGHDR_TO_DATA(pPg), pPg->pgno, 3);
  3422   PAGERTRACE4("FETCH %d page %d hash(%08x)\n",
  3423                PAGERID(pPager), pPg->pgno, pager_pagehash(pPg));
  3424   return rc;
  3425 }
  3426 
  3427 
/*
** This function is called to obtain the shared lock required before
** data may be read from the pager cache. If the shared lock has already
** been obtained, this function is a no-op.
**
** Immediately after obtaining the shared lock (if required), this function
** checks for a hot-journal file. If one is found, an emergency rollback
** is performed immediately.
**
** Returns SQLITE_OK on success.  If the pager is in an error state other
** than SQLITE_FULL that cannot be cleared here, that error code is
** returned.  Every failure routed through the "failed" label calls
** pager_unlock() before returning.
*/
static int pagerSharedLock(Pager *pPager){
  int rc = SQLITE_OK;
  int isErrorReset = 0;    /* True if an error state was cleared while a
                           ** journal was open; forces journal playback */

  /* If this database is opened for exclusive access, has no outstanding 
  ** page references and is in an error-state, now is the chance to clear
  ** the error. Discard the contents of the pager-cache and treat any
  ** open journal file as a hot-journal.
  */
  if( !MEMDB && pPager->exclusiveMode && pPager->nRef==0 && pPager->errCode ){
    if( pPager->journalOpen ){
      isErrorReset = 1;
    }
    pPager->errCode = SQLITE_OK;
    pager_reset(pPager);
  }

  /* If the pager is still in an error state, do not proceed. The error 
  ** state will be cleared at some point in the future when all page 
  ** references are dropped and the cache can be discarded.
  */
  if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
    return pPager->errCode;
  }

  if( pPager->state==PAGER_UNLOCK || isErrorReset ){
    sqlite3_vfs *pVfs = pPager->pVfs;
    if( !MEMDB ){
      int isHotJournal;    /* Only set (and only read) if !isErrorReset */
      assert( pPager->nRef==0 );
      if( !pPager->noReadlock ){
        rc = pager_wait_on_lock(pPager, SHARED_LOCK);
        if( rc!=SQLITE_OK ){
          assert( pPager->state==PAGER_UNLOCK );
          return pager_error(pPager, rc);
        }
        assert( pPager->state>=SHARED_LOCK );
      }
  
      /* If a journal file exists, and there is no RESERVED lock on the
      ** database file, then it either needs to be played back or deleted.
      */
      if( !isErrorReset ){
        rc = hasHotJournal(pPager, &isHotJournal);
        if( rc!=SQLITE_OK ){
          goto failed;
        }
      }
      if( isErrorReset || isHotJournal ){
        /* Get an EXCLUSIVE lock on the database file. At this point it is
        ** important that a RESERVED lock is not obtained on the way to the
        ** EXCLUSIVE lock. If it were, another process might open the
        ** database file, detect the RESERVED lock, and conclude that the
        ** database is safe to read while this process is still rolling it 
        ** back.
        ** 
        ** Because the intermediate RESERVED lock is not requested, the
        ** second process will get to this point in the code and fail to
        ** obtain its own EXCLUSIVE lock on the database file.
        */
        if( pPager->state<EXCLUSIVE_LOCK ){
          rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
          if( rc!=SQLITE_OK ){
            rc = pager_error(pPager, rc);
            goto failed;
          }
          pPager->state = PAGER_EXCLUSIVE;
        }
 
        /* Open the journal for read/write access. This is because in 
        ** exclusive-access mode the file descriptor will be kept open and
        ** possibly used for a transaction later on. On some systems, the
        ** OsTruncate() call used in exclusive-access mode also requires
        ** a read/write file handle.
        */
        if( !isErrorReset && pPager->journalOpen==0 ){
          int res;
          rc = sqlite3OsAccess(pVfs,pPager->zJournal,SQLITE_ACCESS_EXISTS,&res);
          if( rc==SQLITE_OK ){
            if( res ){
              int fout = 0;
              int f = SQLITE_OPEN_READWRITE|SQLITE_OPEN_MAIN_JOURNAL;
              assert( !pPager->tempFile );
              rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, f, &fout);
              assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
              /* A journal that could only be opened read-only is of no
              ** use here; close it and report the database as busy. */
              if( fout&SQLITE_OPEN_READONLY ){
                rc = SQLITE_BUSY;
                sqlite3OsClose(pPager->jfd);
              }
            }else{
              /* If the journal does not exist, that means some other process
              ** has already rolled it back */
              rc = SQLITE_BUSY;
            }
          }
        }
        if( rc!=SQLITE_OK ){
          /* Every failure other than the three codes listed below is
          ** reported to the caller as SQLITE_BUSY. */
          if( rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_UNLOCK 
           && rc!=SQLITE_IOERR_NOMEM 
          ){
            rc = SQLITE_BUSY;
          }
          goto failed;
        }
        pPager->journalOpen = 1;
        pPager->journalStarted = 0;
        pPager->journalOff = 0;
        pPager->setMaster = 0;
        pPager->journalHdr = 0;
 
        /* Playback and delete the journal.  Drop the database write
        ** lock and reacquire the read lock.
        */
        rc = pager_playback(pPager, 1);
        if( rc!=SQLITE_OK ){
          rc = pager_error(pPager, rc);
          goto failed;
        }
        assert(pPager->state==PAGER_SHARED || 
            (pPager->exclusiveMode && pPager->state>PAGER_SHARED)
        );
      }

      if( pPager->pAll ){
        /* The shared-lock has just been acquired on the database file
        ** and there are already pages in the cache (from a previous
        ** read or write transaction).  Check to see if the database
        ** has been modified.  If the database has changed, flush the
        ** cache.
        **
        ** Database changes are detected by comparing the
        ** sizeof(pPager->dbFileVers) bytes beginning at offset 24 of the
        ** file against the copy saved in pPager->dbFileVers.  The first
        ** 4 of these bytes are a 32-bit counter that is incremented with
        ** each change.  The other bytes change randomly with each file
        ** change when a codec is in use.
        ** 
        ** There is a vanishingly small chance that a change will not be 
        ** detected.  The chance of an undetected change is so small that
        ** it can be neglected.
        */
        char dbFileVers[sizeof(pPager->dbFileVers)];
        /* Called for its side effects; errors surface via errCode below */
        sqlite3PagerPagecount(pPager, 0);

        if( pPager->errCode ){
          rc = pPager->errCode;
          goto failed;
        }

        if( pPager->dbSize>0 ){
          IOTRACE(("CKVERS %p %d\n", pPager, sizeof(dbFileVers)));
          rc = sqlite3OsRead(pPager->fd, &dbFileVers, sizeof(dbFileVers), 24);
          if( rc!=SQLITE_OK ){
            goto failed;
          }
        }else{
          /* An empty database is treated as having all-zero version bytes */
          memset(dbFileVers, 0, sizeof(dbFileVers));
        }

        if( memcmp(pPager->dbFileVers, dbFileVers, sizeof(dbFileVers))!=0 ){
          pager_reset(pPager);
        }
      }
    }
    assert( pPager->exclusiveMode || pPager->state<=PAGER_SHARED );
    if( pPager->state==PAGER_UNLOCK ){
      pPager->state = PAGER_SHARED;
    }
  }

 failed:
  if( rc!=SQLITE_OK ){
    /* pager_unlock() is a no-op for exclusive mode and in-memory databases. */
    pager_unlock(pPager);
  }
  return rc;
}
  3613 
  3614 /*
  3615 ** Allocate a PgHdr object.   Either create a new one or reuse
  3616 ** an existing one that is not otherwise in use.
  3617 **
  3618 ** A new PgHdr structure is created if any of the following are
  3619 ** true:
  3620 **
  3621 **     (1)  We have not exceeded our maximum allocated cache size
  3622 **          as set by the "PRAGMA cache_size" command.
  3623 **
  3624 **     (2)  There are no unused PgHdr objects available at this time.
  3625 **
  3626 **     (3)  This is an in-memory database.
  3627 **
  3628 **     (4)  There are no PgHdr objects that do not require a journal
  3629 **          file sync and a sync of the journal file is currently
  3630 **          prohibited.
  3631 **
  3632 ** Otherwise, reuse an existing PgHdr.  In other words, reuse an
  3633 ** existing PgHdr if all of the following are true:
  3634 **
  3635 **     (1)  We have reached or exceeded the maximum cache size
  3636 **          allowed by "PRAGMA cache_size".
  3637 **
  3638 **     (2)  There is a PgHdr available with PgHdr->nRef==0
  3639 **
  3640 **     (3)  We are not in an in-memory database
  3641 **
  3642 **     (4)  Either there is an available PgHdr that does not need
  3643 **          to be synced to disk or else disk syncing is currently
  3644 **          allowed.
  3645 */
  3646 static int pagerAllocatePage(Pager *pPager, PgHdr **ppPg){
  3647   int rc = SQLITE_OK;
  3648   PgHdr *pPg;
  3649   int nByteHdr;
  3650 
  3651   /* Create a new PgHdr if any of the four conditions defined 
  3652   ** above are met: */
  3653   if( pPager->nPage<pPager->mxPage
  3654    || pPager->lru.pFirst==0 
  3655    || MEMDB
  3656    || (pPager->lru.pFirstSynced==0 && pPager->doNotSync)
  3657   ){
  3658     void *pData = 0;                   /* Initialized to placate warning */
  3659     if( pPager->nPage>=pPager->nHash ){
  3660       pager_resize_hash_table(pPager,
  3661          pPager->nHash<256 ? 256 : pPager->nHash*2);
  3662       if( pPager->nHash==0 ){
  3663         rc = SQLITE_NOMEM;
  3664         goto pager_allocate_out;
  3665       }
  3666     }
  3667     pagerLeave(pPager);
  3668     nByteHdr = sizeof(*pPg) + sizeof(u32) + pPager->nExtra
  3669               + MEMDB*sizeof(PgHistory);
  3670     pPg = sqlite3Malloc( nByteHdr );
  3671     if( pPg ){
  3672       pData = sqlite3PageMalloc( pPager->pageSize );
  3673       if( pData==0 ){
  3674         sqlite3_free(pPg);
  3675         pPg = 0;
  3676       }
  3677     }
  3678     pagerEnter(pPager);
  3679     if( pPg==0 ){
  3680       rc = SQLITE_NOMEM;
  3681       goto pager_allocate_out;
  3682     }
  3683     memset(pPg, 0, nByteHdr);
  3684     pPg->pData = pData;
  3685     pPg->pPager = pPager;
  3686     pPg->pNextAll = pPager->pAll;
  3687 #ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
  3688     if( pPg->pNextAll ){
  3689       pPg->pNextAll->pPrevAll = pPg;
  3690     }
  3691 #endif
  3692     pPager->pAll = pPg;
  3693     pPager->nPage++;
  3694   }else{
  3695     /* Recycle an existing page with a zero ref-count. */
  3696     rc = pager_recycle(pPager, &pPg);
  3697     if( rc==SQLITE_BUSY ){
  3698       rc = SQLITE_IOERR_BLOCKED;
  3699     }
  3700     if( rc!=SQLITE_OK ){
  3701       goto pager_allocate_out;
  3702     }
  3703     assert( pPager->state>=SHARED_LOCK );
  3704     assert(pPg);
  3705   }
  3706   *ppPg = pPg;
  3707 
  3708 pager_allocate_out:
  3709   return rc;
  3710 }
  3711 
  3712 /*
  3713 ** Make sure we have the content for a page.  If the page was
  3714 ** previously acquired with noContent==1, then the content was
  3715 ** just initialized to zeros instead of being read from disk.
  3716 ** But now we need the real data off of disk.  So make sure we
  3717 ** have it.  Read it in if we do not have it already.
  3718 */
  3719 static int pager_get_content(PgHdr *pPg){
  3720   if( pPg->needRead ){
  3721     int rc = readDbPage(pPg->pPager, pPg, pPg->pgno);
  3722     if( rc==SQLITE_OK ){
  3723       pPg->needRead = 0;
  3724     }else{
  3725       return rc;
  3726     }
  3727   }
  3728   return SQLITE_OK;
  3729 }
  3730 
  3731 /*
  3732 ** Acquire a page.
  3733 **
  3734 ** A read lock on the disk file is obtained when the first page is acquired. 
  3735 ** This read lock is dropped when the last page is released.
  3736 **
  3737 ** This routine works for any page number greater than 0.  If the database
  3738 ** file is smaller than the requested page, then no actual disk
  3739 ** read occurs and the memory image of the page is initialized to
  3740 ** all zeros.  The extra data appended to a page is always initialized
  3741 ** to zeros the first time a page is loaded into memory.
  3742 **
  3743 ** The acquisition might fail for several reasons.  In all cases,
  3744 ** an appropriate error code is returned and *ppPage is set to NULL.
  3745 **
  3746 ** See also sqlite3PagerLookup().  Both this routine and Lookup() attempt
  3747 ** to find a page in the in-memory cache first.  If the page is not already
  3748 ** in memory, this routine goes to disk to read it in whereas Lookup()
  3749 ** just returns 0.  This routine acquires a read-lock the first time it
  3750 ** has to go to disk, and could also playback an old journal if necessary.
  3751 ** Since Lookup() never goes to disk, it never has to deal with locks
  3752 ** or journal files.
  3753 **
  3754 ** If noContent is false, the page contents are actually read from disk.
  3755 ** If noContent is true, it means that we do not care about the contents
  3756 ** of the page at this time, so do not do a disk read.  Just fill in the
  3757 ** page content with zeros.  But mark the fact that we have not read the
  3758 ** content by setting the PgHdr.needRead flag.  Later on, if 
  3759 ** sqlite3PagerWrite() is called on this page or if this routine is
  3760 ** called again with noContent==0, that means that the content is needed
  3761 ** and the disk read should occur at that point.
  3762 */
  3763 static int pagerAcquire(
  3764   Pager *pPager,      /* The pager open on the database file */
  3765   Pgno pgno,          /* Page number to fetch */
  3766   DbPage **ppPage,    /* Write a pointer to the page here */
  3767   int noContent       /* Do not bother reading content from disk if true */
  3768 ){
  3769   PgHdr *pPg;
  3770   int rc;
  3771 
  3772   assert( pPager->state==PAGER_UNLOCK || pPager->nRef>0 || pgno==1 );
  3773 
  3774   /* The maximum page number is 2^31. Return SQLITE_CORRUPT if a page
  3775   ** number greater than this, or zero, is requested.
  3776   */
  3777   if( pgno>PAGER_MAX_PGNO || pgno==0 || pgno==PAGER_MJ_PGNO(pPager) ){
  3778     return SQLITE_CORRUPT_BKPT;
  3779   }
  3780 
  3781   /* Make sure we have not hit any critical errors.
  3782   */ 
  3783   assert( pPager!=0 );
  3784   *ppPage = 0;
  3785 
  3786   /* If this is the first page accessed, then get a SHARED lock
  3787   ** on the database file. pagerSharedLock() is a no-op if 
  3788   ** a database lock is already held.
  3789   */
  3790   rc = pagerSharedLock(pPager);
  3791   if( rc!=SQLITE_OK ){
  3792     return rc;
  3793   }
  3794   assert( pPager->state!=PAGER_UNLOCK );
  3795 
  3796   pPg = pager_lookup(pPager, pgno);
  3797   if( pPg==0 ){
  3798     /* The requested page is not in the page cache. */
  3799     int nMax;
  3800     int h;
  3801     PAGER_INCR(pPager->nMiss);
  3802     rc = pagerAllocatePage(pPager, &pPg);
  3803     if( rc!=SQLITE_OK ){
  3804       return rc;
  3805     }
  3806 
  3807     pPg->pgno = pgno;
  3808     assert( !MEMDB || pgno>pPager->stmtSize );
  3809     pPg->inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno);
  3810     pPg->needSync = 0;
  3811 
  3812     makeClean(pPg);
  3813     pPg->nRef = 1;
  3814 
  3815     pPager->nRef++;
  3816     if( pPager->nExtra>0 ){
  3817       memset(PGHDR_TO_EXTRA(pPg, pPager), 0, pPager->nExtra);
  3818     }
  3819     rc = sqlite3PagerPagecount(pPager, &nMax);
  3820     if( rc!=SQLITE_OK ){
  3821       sqlite3PagerUnref(pPg);
  3822       return rc;
  3823     }
  3824 
  3825     /* Populate the page with data, either by reading from the database
  3826     ** file, or by setting the entire page to zero.
  3827     */
  3828     if( nMax<(int)pgno || MEMDB || (noContent && !pPager->alwaysRollback) ){
  3829       if( pgno>pPager->mxPgno ){
  3830         sqlite3PagerUnref(pPg);
  3831         return SQLITE_FULL;
  3832       }
  3833       memset(PGHDR_TO_DATA(pPg), 0, pPager->pageSize);
  3834       pPg->needRead = noContent && !pPager->alwaysRollback;
  3835       IOTRACE(("ZERO %p %d\n", pPager, pgno));
  3836     }else{
  3837       rc = readDbPage(pPager, pPg, pgno);
  3838       if( rc!=SQLITE_OK && rc!=SQLITE_IOERR_SHORT_READ ){
  3839         pPg->pgno = 0;
  3840         sqlite3PagerUnref(pPg);
  3841         return rc;
  3842       }
  3843       pPg->needRead = 0;
  3844     }
  3845 
  3846     /* Link the page into the page hash table */
  3847     h = pgno & (pPager->nHash-1);
  3848     assert( pgno!=0 );
  3849     pPg->pNextHash = pPager->aHash[h];
  3850     pPager->aHash[h] = pPg;
  3851     if( pPg->pNextHash ){
  3852       assert( pPg->pNextHash->pPrevHash==0 );
  3853       pPg->pNextHash->pPrevHash = pPg;
  3854     }
  3855 
  3856 #ifdef SQLITE_CHECK_PAGES
  3857     pPg->pageHash = pager_pagehash(pPg);
  3858 #endif
  3859   }else{
  3860     /* The requested page is in the page cache. */
  3861     assert(pPager->nRef>0 || pgno==1);
  3862     PAGER_INCR(pPager->nHit);
  3863     if( !noContent ){
  3864       rc = pager_get_content(pPg);
  3865       if( rc ){
  3866         return rc;
  3867       }
  3868     }
  3869     page_ref(pPg);
  3870   }
  3871   *ppPage = pPg;
  3872   return SQLITE_OK;
  3873 }
  3874 int sqlite3PagerAcquire(
  3875   Pager *pPager,      /* The pager open on the database file */
  3876   Pgno pgno,          /* Page number to fetch */
  3877   DbPage **ppPage,    /* Write a pointer to the page here */
  3878   int noContent       /* Do not bother reading content from disk if true */
  3879 ){
  3880   int rc;
  3881   pagerEnter(pPager);
  3882   rc = pagerAcquire(pPager, pgno, ppPage, noContent);
  3883   pagerLeave(pPager);
  3884   return rc;
  3885 }
  3886 
  3887 
  3888 /*
  3889 ** Acquire a page if it is already in the in-memory cache.  Do
  3890 ** not read the page from disk.  Return a pointer to the page,
  3891 ** or 0 if the page is not in cache.
  3892 **
  3893 ** See also sqlite3PagerGet().  The difference between this routine
  3894 ** and sqlite3PagerGet() is that _get() will go to the disk and read
  3895 ** in the page if the page is not already in cache.  This routine
  3896 ** returns NULL if the page is not in cache or if a disk I/O error 
  3897 ** has ever happened.
  3898 */
  3899 DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno){
  3900   PgHdr *pPg = 0;
  3901 
  3902   assert( pPager!=0 );
  3903   assert( pgno!=0 );
  3904 
  3905   pagerEnter(pPager);
  3906   if( pPager->state==PAGER_UNLOCK ){
  3907     assert( !pPager->pAll || pPager->exclusiveMode );
  3908   }else if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
  3909     /* Do nothing */
  3910   }else if( (pPg = pager_lookup(pPager, pgno))!=0 ){
  3911     page_ref(pPg);
  3912   }
  3913   pagerLeave(pPager);
  3914   return pPg;
  3915 }
  3916 
  3917 /*
  3918 ** Release a page.
  3919 **
  3920 ** If the number of references to the page drop to zero, then the
  3921 ** page is added to the LRU list.  When all references to all pages
  3922 ** are released, a rollback occurs and the lock on the database is
  3923 ** removed.
  3924 */
  3925 int sqlite3PagerUnref(DbPage *pPg){
  3926   Pager *pPager;
  3927 
  3928   if( pPg==0 ) return SQLITE_OK;
  3929   pPager = pPg->pPager;
  3930 
  3931   /* Decrement the reference count for this page
  3932   */
  3933   assert( pPg->nRef>0 );
  3934   pagerEnter(pPg->pPager);
  3935   pPg->nRef--;
  3936 
  3937   CHECK_PAGE(pPg);
  3938 
  3939   /* When the number of references to a page reach 0, call the
  3940   ** destructor and add the page to the freelist.
  3941   */
  3942   if( pPg->nRef==0 ){
  3943 
  3944     lruListAdd(pPg);
  3945     if( pPager->xDestructor ){
  3946       pPager->xDestructor(pPg, pPager->pageSize);
  3947     }
  3948   
  3949     /* When all pages reach the freelist, drop the read lock from
  3950     ** the database file.
  3951     */
  3952     pPager->nRef--;
  3953     assert( pPager->nRef>=0 );
  3954     if( pPager->nRef==0 && (!pPager->exclusiveMode || pPager->journalOff>0) ){
  3955       pagerUnlockAndRollback(pPager);
  3956     }
  3957   }
  3958   pagerLeave(pPager);
  3959   return SQLITE_OK;
  3960 }
  3961 
  3962 /*
  3963 ** Create a journal file for pPager.  There should already be a RESERVED
  3964 ** or EXCLUSIVE lock on the database file when this routine is called.
  3965 **
  3966 ** Return SQLITE_OK if everything.  Return an error code and release the
  3967 ** write lock if anything goes wrong.
  3968 */
  3969 static int pager_open_journal(Pager *pPager){
  3970   sqlite3_vfs *pVfs = pPager->pVfs;
  3971   int flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_EXCLUSIVE|SQLITE_OPEN_CREATE);
  3972 
  3973   int rc;
  3974   assert( !MEMDB );
  3975   assert( pPager->state>=PAGER_RESERVED );
  3976   assert( pPager->useJournal );
  3977   assert( pPager->pInJournal==0 );
  3978   sqlite3PagerPagecount(pPager, 0);
  3979   pagerLeave(pPager);
  3980   pPager->pInJournal = sqlite3BitvecCreate(pPager->dbSize);
  3981   pagerEnter(pPager);
  3982   if( pPager->pInJournal==0 ){
  3983     rc = SQLITE_NOMEM;
  3984     goto failed_to_open_journal;
  3985   }
  3986 
  3987   if( pPager->journalOpen==0 ){
  3988     if( pPager->tempFile ){
  3989       flags |= (SQLITE_OPEN_DELETEONCLOSE|SQLITE_OPEN_TEMP_JOURNAL);
  3990     }else{
  3991       flags |= (SQLITE_OPEN_MAIN_JOURNAL);
  3992     }
  3993 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
  3994     rc = sqlite3JournalOpen(
  3995         pVfs, pPager->zJournal, pPager->jfd, flags, jrnlBufferSize(pPager)
  3996     );
  3997 #else
  3998     rc = sqlite3OsOpen(pVfs, pPager->zJournal, pPager->jfd, flags, 0);
  3999 #endif
  4000     assert( rc!=SQLITE_OK || pPager->jfd->pMethods );
  4001     pPager->journalOff = 0;
  4002     pPager->setMaster = 0;
  4003     pPager->journalHdr = 0;
  4004     if( rc!=SQLITE_OK ){
  4005       if( rc==SQLITE_NOMEM ){
  4006         sqlite3OsDelete(pVfs, pPager->zJournal, 0);
  4007       }
  4008       goto failed_to_open_journal;
  4009     }
  4010   }
  4011   pPager->journalOpen = 1;
  4012   pPager->journalStarted = 0;
  4013   pPager->needSync = 0;
  4014   pPager->alwaysRollback = 0;
  4015   pPager->nRec = 0;
  4016   if( pPager->errCode ){
  4017     rc = pPager->errCode;
  4018     goto failed_to_open_journal;
  4019   }
  4020   pPager->origDbSize = pPager->dbSize;
  4021 
  4022   rc = writeJournalHdr(pPager);
  4023 
  4024   if( pPager->stmtAutoopen && rc==SQLITE_OK ){
  4025     rc = sqlite3PagerStmtBegin(pPager);
  4026   }
  4027   if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM && rc!=SQLITE_IOERR_NOMEM ){
  4028     rc = pager_end_transaction(pPager, 0);
  4029     if( rc==SQLITE_OK ){
  4030       rc = SQLITE_FULL;
  4031     }
  4032   }
  4033   return rc;
  4034 
  4035 failed_to_open_journal:
  4036   sqlite3BitvecDestroy(pPager->pInJournal);
  4037   pPager->pInJournal = 0;
  4038   return rc;
  4039 }
  4040 
  4041 /*
  4042 ** Acquire a write-lock on the database.  The lock is removed when
  4043 ** the any of the following happen:
  4044 **
  4045 **   *  sqlite3PagerCommitPhaseTwo() is called.
  4046 **   *  sqlite3PagerRollback() is called.
  4047 **   *  sqlite3PagerClose() is called.
  4048 **   *  sqlite3PagerUnref() is called to on every outstanding page.
  4049 **
  4050 ** The first parameter to this routine is a pointer to any open page of the
  4051 ** database file.  Nothing changes about the page - it is used merely to
  4052 ** acquire a pointer to the Pager structure and as proof that there is
  4053 ** already a read-lock on the database.
  4054 **
  4055 ** The second parameter indicates how much space in bytes to reserve for a
  4056 ** master journal file-name at the start of the journal when it is created.
  4057 **
  4058 ** A journal file is opened if this is not a temporary file.  For temporary
  4059 ** files, the opening of the journal file is deferred until there is an
  4060 ** actual need to write to the journal.
  4061 **
  4062 ** If the database is already reserved for writing, this routine is a no-op.
  4063 **
  4064 ** If exFlag is true, go ahead and get an EXCLUSIVE lock on the file
  4065 ** immediately instead of waiting until we try to flush the cache.  The
  4066 ** exFlag is ignored if a transaction is already active.
  4067 */
  4068 int sqlite3PagerBegin(DbPage *pPg, int exFlag){
  4069   Pager *pPager = pPg->pPager;
  4070   int rc = SQLITE_OK;
  4071   pagerEnter(pPager);
  4072   assert( pPg->nRef>0 );
  4073   assert( pPager->state!=PAGER_UNLOCK );
  4074   if( pPager->state==PAGER_SHARED ){
  4075     assert( pPager->pInJournal==0 );
  4076     if( MEMDB ){
  4077       pPager->state = PAGER_EXCLUSIVE;
  4078       pPager->origDbSize = pPager->dbSize;
  4079     }else{
  4080       rc = sqlite3OsLock(pPager->fd, RESERVED_LOCK);
  4081       if( rc==SQLITE_OK ){
  4082         pPager->state = PAGER_RESERVED;
  4083         if( exFlag ){
  4084           rc = pager_wait_on_lock(pPager, EXCLUSIVE_LOCK);
  4085         }
  4086       }
  4087       if( rc!=SQLITE_OK ){
  4088         pagerLeave(pPager);
  4089         return rc;
  4090       }
  4091       pPager->dirtyCache = 0;
  4092       PAGERTRACE2("TRANSACTION %d\n", PAGERID(pPager));
  4093       if( pPager->useJournal && !pPager->tempFile
  4094              && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
  4095         rc = pager_open_journal(pPager);
  4096       }
  4097     }
  4098   }else if( pPager->journalOpen && pPager->journalOff==0 ){
  4099     /* This happens when the pager was in exclusive-access mode the last
  4100     ** time a (read or write) transaction was successfully concluded
  4101     ** by this connection. Instead of deleting the journal file it was 
  4102     ** kept open and either was truncated to 0 bytes or its header was
  4103     ** overwritten with zeros.
  4104     */
  4105     assert( pPager->nRec==0 );
  4106     assert( pPager->origDbSize==0 );
  4107     assert( pPager->pInJournal==0 );
  4108     sqlite3PagerPagecount(pPager, 0);
  4109     pagerLeave(pPager);
  4110     pPager->pInJournal = sqlite3BitvecCreate( pPager->dbSize );
  4111     pagerEnter(pPager);
  4112     if( !pPager->pInJournal ){
  4113       rc = SQLITE_NOMEM;
  4114     }else{
  4115       pPager->origDbSize = pPager->dbSize;
  4116       rc = writeJournalHdr(pPager);
  4117     }
  4118   }
  4119   assert( !pPager->journalOpen || pPager->journalOff>0 || rc!=SQLITE_OK );
  4120   pagerLeave(pPager);
  4121   return rc;
  4122 }
  4123 
  4124 /*
  4125 ** Make a page dirty.  Set its dirty flag and add it to the dirty
  4126 ** page list.
  4127 */
  4128 static void makeDirty(PgHdr *pPg){
  4129   if( pPg->dirty==0 ){
  4130     Pager *pPager = pPg->pPager;
  4131     pPg->dirty = 1;
  4132     pPg->pDirty = pPager->pDirty;
  4133     if( pPager->pDirty ){
  4134       pPager->pDirty->pPrevDirty = pPg;
  4135     }
  4136     pPg->pPrevDirty = 0;
  4137     pPager->pDirty = pPg;
  4138   }
  4139 }
  4140 
  4141 /*
  4142 ** Make a page clean.  Clear its dirty bit and remove it from the
  4143 ** dirty page list.
  4144 */
  4145 static void makeClean(PgHdr *pPg){
  4146   if( pPg->dirty ){
  4147     pPg->dirty = 0;
  4148     if( pPg->pDirty ){
  4149       assert( pPg->pDirty->pPrevDirty==pPg );
  4150       pPg->pDirty->pPrevDirty = pPg->pPrevDirty;
  4151     }
  4152     if( pPg->pPrevDirty ){
  4153       assert( pPg->pPrevDirty->pDirty==pPg );
  4154       pPg->pPrevDirty->pDirty = pPg->pDirty;
  4155     }else{
  4156       assert( pPg->pPager->pDirty==pPg );
  4157       pPg->pPager->pDirty = pPg->pDirty;
  4158     }
  4159   }
  4160 }
  4161 
  4162 
  4163 /*
  4164 ** Mark a data page as writeable.  The page is written into the journal 
  4165 ** if it is not there already.  This routine must be called before making
  4166 ** changes to a page.
  4167 **
  4168 ** The first time this routine is called, the pager creates a new
  4169 ** journal and acquires a RESERVED lock on the database.  If the RESERVED
  4170 ** lock could not be acquired, this routine returns SQLITE_BUSY.  The
  4171 ** calling routine must check for that return value and be careful not to
  4172 ** change any page data until this routine returns SQLITE_OK.
  4173 **
  4174 ** If the journal file could not be written because the disk is full,
  4175 ** then this routine returns SQLITE_FULL and does an immediate rollback.
  4176 ** All subsequent write attempts also return SQLITE_FULL until there
  4177 ** is a call to sqlite3PagerCommit() or sqlite3PagerRollback() to
  4178 ** reset.
  4179 */
  4180 static int pager_write(PgHdr *pPg){
  4181   void *pData = PGHDR_TO_DATA(pPg);
  4182   Pager *pPager = pPg->pPager;
  4183   int rc = SQLITE_OK;
  4184 
  4185   /* Check for errors
  4186   */
  4187   if( pPager->errCode ){ 
  4188     return pPager->errCode;
  4189   }
  4190   if( pPager->readOnly ){
  4191     return SQLITE_PERM;
  4192   }
  4193 
  4194   assert( !pPager->setMaster );
  4195 
  4196   CHECK_PAGE(pPg);
  4197 
  4198   /* If this page was previously acquired with noContent==1, that means
  4199   ** we didn't really read in the content of the page.  This can happen
  4200   ** (for example) when the page is being moved to the freelist.  But
  4201   ** now we are (perhaps) moving the page off of the freelist for
  4202   ** reuse and we need to know its original content so that content
  4203   ** can be stored in the rollback journal.  So do the read at this
  4204   ** time.
  4205   */
  4206   rc = pager_get_content(pPg);
  4207   if( rc ){
  4208     return rc;
  4209   }
  4210 
  4211   /* Mark the page as dirty.  If the page has already been written
  4212   ** to the journal then we can return right away.
  4213   */
  4214   makeDirty(pPg);
  4215   if( pPg->inJournal && (pageInStatement(pPg) || pPager->stmtInUse==0) ){
  4216     pPager->dirtyCache = 1;
  4217     pPager->dbModified = 1;
  4218   }else{
  4219 
  4220     /* If we get this far, it means that the page needs to be
  4221     ** written to the transaction journal or the ckeckpoint journal
  4222     ** or both.
  4223     **
  4224     ** First check to see that the transaction journal exists and
  4225     ** create it if it does not.
  4226     */
  4227     assert( pPager->state!=PAGER_UNLOCK );
  4228     rc = sqlite3PagerBegin(pPg, 0);
  4229     if( rc!=SQLITE_OK ){
  4230       return rc;
  4231     }
  4232     assert( pPager->state>=PAGER_RESERVED );
  4233     if( !pPager->journalOpen && pPager->useJournal
  4234           && pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
  4235       rc = pager_open_journal(pPager);
  4236       if( rc!=SQLITE_OK ) return rc;
  4237     }
  4238     pPager->dirtyCache = 1;
  4239     pPager->dbModified = 1;
  4240   
  4241     /* The transaction journal now exists and we have a RESERVED or an
  4242     ** EXCLUSIVE lock on the main database file.  Write the current page to
  4243     ** the transaction journal if it is not there already.
  4244     */
  4245     if( !pPg->inJournal && (pPager->journalOpen || MEMDB) ){
  4246       if( (int)pPg->pgno <= pPager->origDbSize ){
  4247         if( MEMDB ){
  4248           PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  4249           PAGERTRACE3("JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
  4250           assert( pHist->pOrig==0 );
  4251           pHist->pOrig = sqlite3PageMalloc( pPager->pageSize );
  4252           if( !pHist->pOrig ){
  4253             return SQLITE_NOMEM;
  4254           }
  4255           memcpy(pHist->pOrig, PGHDR_TO_DATA(pPg), pPager->pageSize);
  4256         }else{
  4257           u32 cksum;
  4258           char *pData2;
  4259 
  4260           /* We should never write to the journal file the page that
  4261           ** contains the database locks.  The following assert verifies
  4262           ** that we do not. */
  4263           assert( pPg->pgno!=PAGER_MJ_PGNO(pPager) );
  4264           pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
  4265           cksum = pager_cksum(pPager, (u8*)pData2);
  4266           rc = write32bits(pPager->jfd, pPager->journalOff, pPg->pgno);
  4267           if( rc==SQLITE_OK ){
  4268             rc = sqlite3OsWrite(pPager->jfd, pData2, pPager->pageSize,
  4269                                 pPager->journalOff + 4);
  4270             pPager->journalOff += pPager->pageSize+4;
  4271           }
  4272           if( rc==SQLITE_OK ){
  4273             rc = write32bits(pPager->jfd, pPager->journalOff, cksum);
  4274             pPager->journalOff += 4;
  4275           }
  4276           IOTRACE(("JOUT %p %d %lld %d\n", pPager, pPg->pgno, 
  4277                    pPager->journalOff, pPager->pageSize));
  4278           PAGER_INCR(sqlite3_pager_writej_count);
  4279           PAGERTRACE5("JOURNAL %d page %d needSync=%d hash(%08x)\n",
  4280                PAGERID(pPager), pPg->pgno, pPg->needSync, pager_pagehash(pPg));
  4281 
  4282           /* An error has occured writing to the journal file. The 
  4283           ** transaction will be rolled back by the layer above.
  4284           */
  4285           if( rc!=SQLITE_OK ){
  4286             return rc;
  4287           }
  4288 
  4289           pPager->nRec++;
  4290           assert( pPager->pInJournal!=0 );
  4291           sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
  4292           pPg->needSync = !pPager->noSync;
  4293           if( pPager->stmtInUse ){
  4294             sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
  4295           }
  4296         }
  4297       }else{
  4298         pPg->needSync = !pPager->journalStarted && !pPager->noSync;
  4299         PAGERTRACE4("APPEND %d page %d needSync=%d\n",
  4300                 PAGERID(pPager), pPg->pgno, pPg->needSync);
  4301       }
  4302       if( pPg->needSync ){
  4303         pPager->needSync = 1;
  4304       }
  4305       pPg->inJournal = 1;
  4306     }
  4307   
  4308     /* If the statement journal is open and the page is not in it,
  4309     ** then write the current page to the statement journal.  Note that
  4310     ** the statement journal format differs from the standard journal format
  4311     ** in that it omits the checksums and the header.
  4312     */
  4313     if( pPager->stmtInUse 
  4314      && !pageInStatement(pPg) 
  4315      && (int)pPg->pgno<=pPager->stmtSize 
  4316     ){
  4317       assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize );
  4318       if( MEMDB ){
  4319         PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  4320         assert( pHist->pStmt==0 );
  4321         pHist->pStmt = sqlite3PageMalloc( pPager->pageSize );
  4322         if( pHist->pStmt ){
  4323           memcpy(pHist->pStmt, PGHDR_TO_DATA(pPg), pPager->pageSize);
  4324         }
  4325         PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
  4326         page_add_to_stmt_list(pPg);
  4327       }else{
  4328         i64 offset = pPager->stmtNRec*(4+pPager->pageSize);
  4329         char *pData2 = CODEC2(pPager, pData, pPg->pgno, 7);
  4330         rc = write32bits(pPager->stfd, offset, pPg->pgno);
  4331         if( rc==SQLITE_OK ){
  4332           rc = sqlite3OsWrite(pPager->stfd, pData2, pPager->pageSize, offset+4);
  4333         }
  4334         PAGERTRACE3("STMT-JOURNAL %d page %d\n", PAGERID(pPager), pPg->pgno);
  4335         if( rc!=SQLITE_OK ){
  4336           return rc;
  4337         }
  4338         pPager->stmtNRec++;
  4339         assert( pPager->pInStmt!=0 );
  4340         sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
  4341       }
  4342     }
  4343   }
  4344 
  4345   /* Update the database size and return.
  4346   */
  4347   assert( pPager->state>=PAGER_SHARED );
  4348   if( pPager->dbSize<(int)pPg->pgno ){
  4349     pPager->dbSize = pPg->pgno;
  4350     if( !MEMDB && pPager->dbSize==PENDING_BYTE/pPager->pageSize ){
  4351       pPager->dbSize++;
  4352     }
  4353   }
  4354   return rc;
  4355 }
  4356 
  4357 /*
  4358 ** This function is used to mark a data-page as writable. It uses 
  4359 ** pager_write() to open a journal file (if it is not already open)
  4360 ** and write the page *pData to the journal.
  4361 **
  4362 ** The difference between this function and pager_write() is that this
  4363 ** function also deals with the special case where 2 or more pages
  4364 ** fit on a single disk sector. In this case all co-resident pages
  4365 ** must have been written to the journal file before returning.
  4366 */
  4367 int sqlite3PagerWrite(DbPage *pDbPage){
  4368   int rc = SQLITE_OK;
  4369 
  4370   PgHdr *pPg = pDbPage;
  4371   Pager *pPager = pPg->pPager;
  4372   Pgno nPagePerSector = (pPager->sectorSize/pPager->pageSize);
  4373 
  4374   pagerEnter(pPager);
  4375   if( !MEMDB && nPagePerSector>1 ){
  4376     Pgno nPageCount;          /* Total number of pages in database file */
  4377     Pgno pg1;                 /* First page of the sector pPg is located on. */
  4378     int nPage;                /* Number of pages starting at pg1 to journal */
  4379     int ii;
  4380     int needSync = 0;
  4381 
  4382     /* Set the doNotSync flag to 1. This is because we cannot allow a journal
  4383     ** header to be written between the pages journaled by this function.
  4384     */
  4385     assert( pPager->doNotSync==0 );
  4386     pPager->doNotSync = 1;
  4387 
  4388     /* This trick assumes that both the page-size and sector-size are
  4389     ** an integer power of 2. It sets variable pg1 to the identifier
  4390     ** of the first page of the sector pPg is located on.
  4391     */
  4392     pg1 = ((pPg->pgno-1) & ~(nPagePerSector-1)) + 1;
  4393 
  4394     sqlite3PagerPagecount(pPager, (int *)&nPageCount);
  4395     if( pPg->pgno>nPageCount ){
  4396       nPage = (pPg->pgno - pg1)+1;
  4397     }else if( (pg1+nPagePerSector-1)>nPageCount ){
  4398       nPage = nPageCount+1-pg1;
  4399     }else{
  4400       nPage = nPagePerSector;
  4401     }
  4402     assert(nPage>0);
  4403     assert(pg1<=pPg->pgno);
  4404     assert((pg1+nPage)>pPg->pgno);
  4405 
  4406     for(ii=0; ii<nPage && rc==SQLITE_OK; ii++){
  4407       Pgno pg = pg1+ii;
  4408       PgHdr *pPage;
  4409       if( pg==pPg->pgno || !sqlite3BitvecTest(pPager->pInJournal, pg) ){
  4410         if( pg!=PAGER_MJ_PGNO(pPager) ){
  4411           rc = sqlite3PagerGet(pPager, pg, &pPage);
  4412           if( rc==SQLITE_OK ){
  4413             rc = pager_write(pPage);
  4414             if( pPage->needSync ){
  4415               needSync = 1;
  4416             }
  4417             sqlite3PagerUnref(pPage);
  4418           }
  4419         }
  4420       }else if( (pPage = pager_lookup(pPager, pg))!=0 ){
  4421         if( pPage->needSync ){
  4422           needSync = 1;
  4423         }
  4424       }
  4425     }
  4426 
  4427     /* If the PgHdr.needSync flag is set for any of the nPage pages 
  4428     ** starting at pg1, then it needs to be set for all of them. Because
  4429     ** writing to any of these nPage pages may damage the others, the
  4430     ** journal file must contain sync()ed copies of all of them
  4431     ** before any of them can be written out to the database file.
  4432     */
  4433     if( needSync ){
  4434       for(ii=0; ii<nPage && needSync; ii++){
  4435         PgHdr *pPage = pager_lookup(pPager, pg1+ii);
  4436         if( pPage ) pPage->needSync = 1;
  4437       }
  4438       assert(pPager->needSync);
  4439     }
  4440 
  4441     assert( pPager->doNotSync==1 );
  4442     pPager->doNotSync = 0;
  4443   }else{
  4444     rc = pager_write(pDbPage);
  4445   }
  4446   pagerLeave(pPager);
  4447   return rc;
  4448 }
  4449 
  4450 /*
  4451 ** Return TRUE if the page given in the argument was previously passed
  4452 ** to sqlite3PagerWrite().  In other words, return TRUE if it is ok
  4453 ** to change the content of the page.
  4454 */
  4455 #ifndef NDEBUG
  4456 int sqlite3PagerIswriteable(DbPage *pPg){
  4457   return pPg->dirty;
  4458 }
  4459 #endif
  4460 
  4461 /*
  4462 ** A call to this routine tells the pager that it is not necessary to
  4463 ** write the information on page pPg back to the disk, even though
  4464 ** that page might be marked as dirty.
  4465 **
  4466 ** The overlying software layer calls this routine when all of the data
  4467 ** on the given page is unused.  The pager marks the page as clean so
  4468 ** that it does not get written to disk.
  4469 **
  4470 ** Tests show that this optimization, together with the
  4471 ** sqlite3PagerDontRollback() below, more than double the speed
  4472 ** of large INSERT operations and quadruple the speed of large DELETEs.
  4473 **
  4474 ** When this routine is called, set the alwaysRollback flag to true.
  4475 ** Subsequent calls to sqlite3PagerDontRollback() for the same page
  4476 ** will thereafter be ignored.  This is necessary to avoid a problem
  4477 ** where a page with data is added to the freelist during one part of
  4478 ** a transaction then removed from the freelist during a later part
  4479 ** of the same transaction and reused for some other purpose.  When it
  4480 ** is first added to the freelist, this routine is called.  When reused,
  4481 ** the sqlite3PagerDontRollback() routine is called.  But because the
  4482 ** page contains critical data, we still need to be sure it gets
  4483 ** rolled back in spite of the sqlite3PagerDontRollback() call.
  4484 */
  4485 void sqlite3PagerDontWrite(DbPage *pDbPage){
  4486   PgHdr *pPg = pDbPage;
  4487   Pager *pPager = pPg->pPager;
  4488 
  4489   if( MEMDB ) return;
  4490   pagerEnter(pPager);
  4491   pPg->alwaysRollback = 1;
  4492   if( pPg->dirty && !pPager->stmtInUse ){
  4493     assert( pPager->state>=PAGER_SHARED );
  4494     if( pPager->dbSize==(int)pPg->pgno && pPager->origDbSize<pPager->dbSize ){
  4495       /* If this pages is the last page in the file and the file has grown
  4496       ** during the current transaction, then do NOT mark the page as clean.
  4497       ** When the database file grows, we must make sure that the last page
  4498       ** gets written at least once so that the disk file will be the correct
  4499       ** size. If you do not write this page and the size of the file
  4500       ** on the disk ends up being too small, that can lead to database
  4501       ** corruption during the next transaction.
  4502       */
  4503     }else{
  4504       PAGERTRACE3("DONT_WRITE page %d of %d\n", pPg->pgno, PAGERID(pPager));
  4505       IOTRACE(("CLEAN %p %d\n", pPager, pPg->pgno))
  4506       makeClean(pPg);
  4507 #ifdef SQLITE_CHECK_PAGES
  4508       pPg->pageHash = pager_pagehash(pPg);
  4509 #endif
  4510     }
  4511   }
  4512   pagerLeave(pPager);
  4513 }
  4514 
  4515 /*
  4516 ** A call to this routine tells the pager that if a rollback occurs,
  4517 ** it is not necessary to restore the data on the given page.  This
  4518 ** means that the pager does not have to record the given page in the
  4519 ** rollback journal.
  4520 **
  4521 ** If we have not yet actually read the content of this page (if
  4522 ** the PgHdr.needRead flag is set) then this routine acts as a promise
  4523 ** that we will never need to read the page content in the future.
  4524 ** so the needRead flag can be cleared at this point.
  4525 */
  4526 void sqlite3PagerDontRollback(DbPage *pPg){
  4527   Pager *pPager = pPg->pPager;
  4528 
  4529   pagerEnter(pPager);
  4530   assert( pPager->state>=PAGER_RESERVED );
  4531 
  4532   /* If the journal file is not open, or DontWrite() has been called on
  4533   ** this page (DontWrite() sets the alwaysRollback flag), then this
  4534   ** function is a no-op.
  4535   */
  4536   if( pPager->journalOpen==0 || pPg->alwaysRollback || pPager->alwaysRollback ){
  4537     pagerLeave(pPager);
  4538     return;
  4539   }
  4540   assert( !MEMDB );    /* For a memdb, pPager->journalOpen is always 0 */
  4541 
  4542 #ifdef SQLITE_SECURE_DELETE
  4543   if( pPg->inJournal || (int)pPg->pgno > pPager->origDbSize ){
  4544     return;
  4545   }
  4546 #endif
  4547 
  4548   /* If SECURE_DELETE is disabled, then there is no way that this
  4549   ** routine can be called on a page for which sqlite3PagerDontWrite()
  4550   ** has not been previously called during the same transaction.
  4551   ** And if DontWrite() has previously been called, the following
  4552   ** conditions must be met.
  4553   **
  4554   ** (Later:)  Not true.  If the database is corrupted by having duplicate
  4555   ** pages on the freelist (ex: corrupt9.test) then the following is not
  4556   ** necessarily true:
  4557   */
  4558   /* assert( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ); */
  4559 
  4560   assert( pPager->pInJournal!=0 );
  4561   sqlite3BitvecSet(pPager->pInJournal, pPg->pgno);
  4562   pPg->inJournal = 1;
  4563   pPg->needRead = 0;
  4564   if( pPager->stmtInUse ){
  4565     assert( pPager->stmtSize >= pPager->origDbSize );
  4566     sqlite3BitvecSet(pPager->pInStmt, pPg->pgno);
  4567   }
  4568   PAGERTRACE3("DONT_ROLLBACK page %d of %d\n", pPg->pgno, PAGERID(pPager));
  4569   IOTRACE(("GARBAGE %p %d\n", pPager, pPg->pgno))
  4570   pagerLeave(pPager);
  4571 }
  4572 
  4573 
  4574 /*
  4575 ** This routine is called to increment the database file change-counter,
  4576 ** stored at byte 24 of the pager file.
  4577 */
  4578 static int pager_incr_changecounter(Pager *pPager, int isDirect){
  4579   PgHdr *pPgHdr;
  4580   u32 change_counter;
  4581   int rc = SQLITE_OK;
  4582 
  4583 #ifndef SQLITE_ENABLE_ATOMIC_WRITE
  4584   assert( isDirect==0 );  /* isDirect is only true for atomic writes */
  4585 #endif
  4586   if( !pPager->changeCountDone ){
  4587     /* Open page 1 of the file for writing. */
  4588     rc = sqlite3PagerGet(pPager, 1, &pPgHdr);
  4589     if( rc!=SQLITE_OK ) return rc;
  4590 
  4591     if( !isDirect ){
  4592       rc = sqlite3PagerWrite(pPgHdr);
  4593       if( rc!=SQLITE_OK ){
  4594         sqlite3PagerUnref(pPgHdr);
  4595         return rc;
  4596       }
  4597     }
  4598 
  4599     /* Increment the value just read and write it back to byte 24. */
  4600     change_counter = sqlite3Get4byte((u8*)pPager->dbFileVers);
  4601     change_counter++;
  4602     put32bits(((char*)PGHDR_TO_DATA(pPgHdr))+24, change_counter);
  4603 
  4604 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
  4605     if( isDirect && pPager->fd->pMethods ){
  4606       const void *zBuf = PGHDR_TO_DATA(pPgHdr);
  4607       rc = sqlite3OsWrite(pPager->fd, zBuf, pPager->pageSize, 0);
  4608     }
  4609 #endif
  4610 
  4611     /* Release the page reference. */
  4612     sqlite3PagerUnref(pPgHdr);
  4613     pPager->changeCountDone = 1;
  4614   }
  4615   return rc;
  4616 }
  4617 
  4618 /*
  4619 ** Sync the pager file to disk.
  4620 */
  4621 int sqlite3PagerSync(Pager *pPager){
  4622   int rc;
  4623   pagerEnter(pPager);
  4624   rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
  4625   pagerLeave(pPager);
  4626   return rc;
  4627 }
  4628 
  4629 /*
  4630 ** Sync the database file for the pager pPager. zMaster points to the name
  4631 ** of a master journal file that should be written into the individual
  4632 ** journal file. zMaster may be NULL, which is interpreted as no master
  4633 ** journal (a single database transaction).
  4634 **
  4635 ** This routine ensures that the journal is synced, all dirty pages written
  4636 ** to the database file and the database file synced. The only thing that
  4637 ** remains to commit the transaction is to delete the journal file (or
  4638 ** master journal file if specified).
  4639 **
  4640 ** Note that if zMaster==NULL, this does not overwrite a previous value
  4641 ** passed to an sqlite3PagerCommitPhaseOne() call.
  4642 **
  4643 ** If parameter nTrunc is non-zero, then the pager file is truncated to
  4644 ** nTrunc pages (this is used by auto-vacuum databases).
  4645 **
  4646 ** If the final parameter - noSync - is true, then the database file itself
  4647 ** is not synced. The caller must call sqlite3PagerSync() directly to
  4648 ** sync the database file before calling CommitPhaseTwo() to delete the
  4649 ** journal file in this case.
  4650 */
  4651 int sqlite3PagerCommitPhaseOne(
  4652   Pager *pPager, 
  4653   const char *zMaster, 
  4654   Pgno nTrunc,
  4655   int noSync
  4656 ){
  4657   int rc = SQLITE_OK;
  4658 
  4659   if( pPager->errCode ){
  4660     return pPager->errCode;
  4661   }
  4662 
  4663   /* If no changes have been made, we can leave the transaction early.
  4664   */
  4665   if( pPager->dbModified==0 &&
  4666         (pPager->journalMode!=PAGER_JOURNALMODE_DELETE ||
  4667           pPager->exclusiveMode!=0) ){
  4668     assert( pPager->dirtyCache==0 || pPager->journalOpen==0 );
  4669     return SQLITE_OK;
  4670   }
  4671 
  4672   PAGERTRACE4("DATABASE SYNC: File=%s zMaster=%s nTrunc=%d\n", 
  4673       pPager->zFilename, zMaster, nTrunc);
  4674   pagerEnter(pPager);
  4675 
  4676   /* If this is an in-memory db, or no pages have been written to, or this
  4677   ** function has already been called, it is a no-op.
  4678   */
  4679   if( pPager->state!=PAGER_SYNCED && !MEMDB && pPager->dirtyCache ){
  4680     PgHdr *pPg;
  4681 
  4682 #ifdef SQLITE_ENABLE_ATOMIC_WRITE
  4683     /* The atomic-write optimization can be used if all of the
  4684     ** following are true:
  4685     **
  4686     **    + The file-system supports the atomic-write property for
  4687     **      blocks of size page-size, and
  4688     **    + This commit is not part of a multi-file transaction, and
  4689     **    + Exactly one page has been modified and store in the journal file.
  4690     **
  4691     ** If the optimization can be used, then the journal file will never
  4692     ** be created for this transaction.
  4693     */
  4694     int useAtomicWrite = (
  4695         !zMaster && 
  4696         pPager->journalOpen &&
  4697         pPager->journalOff==jrnlBufferSize(pPager) && 
  4698         nTrunc==0 && 
  4699         (0==pPager->pDirty || 0==pPager->pDirty->pDirty)
  4700     );
  4701     assert( pPager->journalOpen || pPager->journalMode==PAGER_JOURNALMODE_OFF );
  4702     if( useAtomicWrite ){
  4703       /* Update the nRec field in the journal file. */
  4704       int offset = pPager->journalHdr + sizeof(aJournalMagic);
  4705       assert(pPager->nRec==1);
  4706       rc = write32bits(pPager->jfd, offset, pPager->nRec);
  4707 
  4708       /* Update the db file change counter. The following call will modify
  4709       ** the in-memory representation of page 1 to include the updated
  4710       ** change counter and then write page 1 directly to the database
  4711       ** file. Because of the atomic-write property of the host file-system, 
  4712       ** this is safe.
  4713       */
  4714       if( rc==SQLITE_OK ){
  4715         rc = pager_incr_changecounter(pPager, 1);
  4716       }
  4717     }else{
  4718       rc = sqlite3JournalCreate(pPager->jfd);
  4719     }
  4720 
  4721     if( !useAtomicWrite && rc==SQLITE_OK )
  4722 #endif
  4723 
  4724     /* If a master journal file name has already been written to the
  4725     ** journal file, then no sync is required. This happens when it is
  4726     ** written, then the process fails to upgrade from a RESERVED to an
  4727     ** EXCLUSIVE lock. The next time the process tries to commit the
  4728     ** transaction the m-j name will have already been written.
  4729     */
  4730     if( !pPager->setMaster ){
  4731       rc = pager_incr_changecounter(pPager, 0);
  4732       if( rc!=SQLITE_OK ) goto sync_exit;
  4733       if( pPager->journalMode!=PAGER_JOURNALMODE_OFF ){
  4734 #ifndef SQLITE_OMIT_AUTOVACUUM
  4735         if( nTrunc!=0 ){
  4736           /* If this transaction has made the database smaller, then all pages
  4737           ** being discarded by the truncation must be written to the journal
  4738           ** file.
  4739           */
  4740           Pgno i;
  4741           int iSkip = PAGER_MJ_PGNO(pPager);
  4742           for( i=nTrunc+1; i<=pPager->origDbSize; i++ ){
  4743             if( !sqlite3BitvecTest(pPager->pInJournal, i) && i!=iSkip ){
  4744               rc = sqlite3PagerGet(pPager, i, &pPg);
  4745               if( rc!=SQLITE_OK ) goto sync_exit;
  4746               rc = sqlite3PagerWrite(pPg);
  4747               sqlite3PagerUnref(pPg);
  4748               if( rc!=SQLITE_OK ) goto sync_exit;
  4749             }
  4750           } 
  4751         }
  4752 #endif
  4753         rc = writeMasterJournal(pPager, zMaster);
  4754         if( rc!=SQLITE_OK ) goto sync_exit;
  4755         rc = syncJournal(pPager);
  4756       }
  4757     }
  4758     if( rc!=SQLITE_OK ) goto sync_exit;
  4759 
  4760 #ifndef SQLITE_OMIT_AUTOVACUUM
  4761     if( nTrunc!=0 ){
  4762       rc = sqlite3PagerTruncate(pPager, nTrunc);
  4763       if( rc!=SQLITE_OK ) goto sync_exit;
  4764     }
  4765 #endif
  4766 
  4767     /* Write all dirty pages to the database file */
  4768     pPg = pager_get_all_dirty_pages(pPager);
  4769     rc = pager_write_pagelist(pPg);
  4770     if( rc!=SQLITE_OK ){
  4771       assert( rc!=SQLITE_IOERR_BLOCKED );
  4772       /* The error might have left the dirty list all fouled up here,
  4773       ** but that does not matter because if the if the dirty list did
  4774       ** get corrupted, then the transaction will roll back and
  4775       ** discard the dirty list.  There is an assert in
  4776       ** pager_get_all_dirty_pages() that verifies that no attempt
  4777       ** is made to use an invalid dirty list.
  4778       */
  4779       goto sync_exit;
  4780     }
  4781     pPager->pDirty = 0;
  4782 
  4783     /* Sync the database file. */
  4784     if( !pPager->noSync && !noSync ){
  4785       rc = sqlite3OsSync(pPager->fd, pPager->sync_flags);
  4786     }
  4787     IOTRACE(("DBSYNC %p\n", pPager))
  4788 
  4789     pPager->state = PAGER_SYNCED;
  4790   }else if( MEMDB && nTrunc!=0 ){
  4791     rc = sqlite3PagerTruncate(pPager, nTrunc);
  4792   }
  4793 
  4794 sync_exit:
  4795   if( rc==SQLITE_IOERR_BLOCKED ){
  4796     /* pager_incr_changecounter() may attempt to obtain an exclusive
  4797      * lock to spill the cache and return IOERR_BLOCKED. But since 
  4798      * there is no chance the cache is inconsistent, it is
  4799      * better to return SQLITE_BUSY.
  4800      */
  4801     rc = SQLITE_BUSY;
  4802   }
  4803   pagerLeave(pPager);
  4804   return rc;
  4805 }
  4806 
  4807 
  4808 /*
  4809 ** Commit all changes to the database and release the write lock.
  4810 **
  4811 ** If the commit fails for any reason, a rollback attempt is made
  4812 ** and an error code is returned.  If the commit worked, SQLITE_OK
  4813 ** is returned.
  4814 */
  4815 int sqlite3PagerCommitPhaseTwo(Pager *pPager){
  4816   int rc;
  4817   PgHdr *pPg;
  4818 
  4819   if( pPager->errCode ){
  4820     return pPager->errCode;
  4821   }
  4822   if( pPager->state<PAGER_RESERVED ){
  4823     return SQLITE_ERROR;
  4824   }
  4825   if( pPager->dbModified==0 &&
  4826         (pPager->journalMode!=PAGER_JOURNALMODE_DELETE ||
  4827           pPager->exclusiveMode!=0) ){
  4828     assert( pPager->dirtyCache==0 || pPager->journalOpen==0 );
  4829     return SQLITE_OK;
  4830   }
  4831   pagerEnter(pPager);
  4832   PAGERTRACE2("COMMIT %d\n", PAGERID(pPager));
  4833   if( MEMDB ){
  4834     pPg = pager_get_all_dirty_pages(pPager);
  4835     while( pPg ){
  4836       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  4837       clearHistory(pHist);
  4838       pPg->dirty = 0;
  4839       pPg->inJournal = 0;
  4840       pHist->inStmt = 0;
  4841       pPg->needSync = 0;
  4842       pHist->pPrevStmt = pHist->pNextStmt = 0;
  4843       pPg = pPg->pDirty;
  4844     }
  4845     pPager->pDirty = 0;
  4846 #ifndef NDEBUG
  4847     for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  4848       PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  4849       assert( !pPg->alwaysRollback );
  4850       assert( !pHist->pOrig );
  4851       assert( !pHist->pStmt );
  4852     }
  4853 #endif
  4854     pPager->pStmt = 0;
  4855     pPager->state = PAGER_SHARED;
  4856     pagerLeave(pPager);
  4857     return SQLITE_OK;
  4858   }
  4859   assert( pPager->state==PAGER_SYNCED || !pPager->dirtyCache );
  4860   rc = pager_end_transaction(pPager, pPager->setMaster);
  4861   rc = pager_error(pPager, rc);
  4862   pagerLeave(pPager);
  4863   return rc;
  4864 }
  4865 
  4866 /*
  4867 ** Rollback all changes.  The database falls back to PAGER_SHARED mode.
  4868 ** All in-memory cache pages revert to their original data contents.
  4869 ** The journal is deleted.
  4870 **
  4871 ** This routine cannot fail unless some other process is not following
  4872 ** the correct locking protocol or unless some other
  4873 ** process is writing trash into the journal file (SQLITE_CORRUPT) or
  4874 ** unless a prior malloc() failed (SQLITE_NOMEM).  Appropriate error
  4875 ** codes are returned for all these occasions.  Otherwise,
  4876 ** SQLITE_OK is returned.
  4877 */
  4878 int sqlite3PagerRollback(Pager *pPager){
  4879   int rc;
  4880   PAGERTRACE2("ROLLBACK %d\n", PAGERID(pPager));
  4881   if( MEMDB ){
  4882     PgHdr *p;
  4883     for(p=pPager->pAll; p; p=p->pNextAll){
  4884       PgHistory *pHist;
  4885       assert( !p->alwaysRollback );
  4886       if( !p->dirty ){
  4887         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pOrig );
  4888         assert( !((PgHistory *)PGHDR_TO_HIST(p, pPager))->pStmt );
  4889         continue;
  4890       }
  4891 
  4892       pHist = PGHDR_TO_HIST(p, pPager);
  4893       if( pHist->pOrig ){
  4894         memcpy(PGHDR_TO_DATA(p), pHist->pOrig, pPager->pageSize);
  4895         PAGERTRACE3("ROLLBACK-PAGE %d of %d\n", p->pgno, PAGERID(pPager));
  4896       }else{
  4897         PAGERTRACE3("PAGE %d is clean on %d\n", p->pgno, PAGERID(pPager));
  4898       }
  4899       clearHistory(pHist);
  4900       p->dirty = 0;
  4901       p->inJournal = 0;
  4902       pHist->inStmt = 0;
  4903       pHist->pPrevStmt = pHist->pNextStmt = 0;
  4904       if( pPager->xReiniter ){
  4905         pPager->xReiniter(p, pPager->pageSize);
  4906       }
  4907     }
  4908     pPager->pDirty = 0;
  4909     pPager->pStmt = 0;
  4910     pPager->dbSize = pPager->origDbSize;
  4911     pager_truncate_cache(pPager);
  4912     pPager->stmtInUse = 0;
  4913     pPager->state = PAGER_SHARED;
  4914     return SQLITE_OK;
  4915   }
  4916 
  4917   pagerEnter(pPager);
  4918   if( !pPager->dirtyCache || !pPager->journalOpen ){
  4919     rc = pager_end_transaction(pPager, pPager->setMaster);
  4920     pagerLeave(pPager);
  4921     return rc;
  4922   }
  4923 
  4924   if( pPager->errCode && pPager->errCode!=SQLITE_FULL ){
  4925     if( pPager->state>=PAGER_EXCLUSIVE ){
  4926       pager_playback(pPager, 0);
  4927     }
  4928     pagerLeave(pPager);
  4929     return pPager->errCode;
  4930   }
  4931   if( pPager->state==PAGER_RESERVED ){
  4932     int rc2;
  4933     rc = pager_playback(pPager, 0);
  4934     rc2 = pager_end_transaction(pPager, pPager->setMaster);
  4935     if( rc==SQLITE_OK ){
  4936       rc = rc2;
  4937     }
  4938   }else{
  4939     rc = pager_playback(pPager, 0);
  4940   }
  4941   /* pager_reset(pPager); */
  4942   pPager->dbSize = -1;
  4943 
  4944   /* If an error occurs during a ROLLBACK, we can no longer trust the pager
  4945   ** cache. So call pager_error() on the way out to make any error 
  4946   ** persistent.
  4947   */
  4948   rc = pager_error(pPager, rc);
  4949   pagerLeave(pPager);
  4950   return rc;
  4951 }
  4952 
  4953 /*
  4954 ** Return TRUE if the database file is opened read-only.  Return FALSE
  4955 ** if the database is (in theory) writable.
  4956 */
  4957 int sqlite3PagerIsreadonly(Pager *pPager){
  4958   return pPager->readOnly;
  4959 }
  4960 
  4961 /*
  4962 ** Return the number of references to the pager.
  4963 */
  4964 int sqlite3PagerRefcount(Pager *pPager){
  4965   return pPager->nRef;
  4966 }
  4967 
  4968 #ifdef SQLITE_TEST
  4969 /*
  4970 ** This routine is used for testing and analysis only.
  4971 */
  4972 int *sqlite3PagerStats(Pager *pPager){
  4973   static int a[11];
  4974   a[0] = pPager->nRef;
  4975   a[1] = pPager->nPage;
  4976   a[2] = pPager->mxPage;
  4977   a[3] = pPager->dbSize;
  4978   a[4] = pPager->state;
  4979   a[5] = pPager->errCode;
  4980   a[6] = pPager->nHit;
  4981   a[7] = pPager->nMiss;
  4982   a[8] = 0;  /* Used to be pPager->nOvfl */
  4983   a[9] = pPager->nRead;
  4984   a[10] = pPager->nWrite;
  4985   return a;
  4986 }
  4987 int sqlite3PagerIsMemdb(Pager *pPager){
  4988   return MEMDB;
  4989 }
  4990 #endif
  4991 
  4992 /*
  4993 ** Set the statement rollback point.
  4994 **
  4995 ** This routine should be called with the transaction journal already
  4996 ** open.  A new statement journal is created that can be used to rollback
  4997 ** changes of a single SQL command within a larger transaction.
  4998 */
  4999 static int pagerStmtBegin(Pager *pPager){
  5000   int rc;
  5001   assert( !pPager->stmtInUse );
  5002   assert( pPager->state>=PAGER_SHARED );
  5003   assert( pPager->dbSize>=0 );
  5004   PAGERTRACE2("STMT-BEGIN %d\n", PAGERID(pPager));
  5005   if( MEMDB ){
  5006     pPager->stmtInUse = 1;
  5007     pPager->stmtSize = pPager->dbSize;
  5008     return SQLITE_OK;
  5009   }
  5010   if( !pPager->journalOpen ){
  5011     pPager->stmtAutoopen = 1;
  5012     return SQLITE_OK;
  5013   }
  5014   assert( pPager->journalOpen );
  5015   pagerLeave(pPager);
  5016   assert( pPager->pInStmt==0 );
  5017   pPager->pInStmt = sqlite3BitvecCreate(pPager->dbSize);
  5018   pagerEnter(pPager);
  5019   if( pPager->pInStmt==0 ){
  5020     /* sqlite3OsLock(pPager->fd, SHARED_LOCK); */
  5021     return SQLITE_NOMEM;
  5022   }
  5023   pPager->stmtJSize = pPager->journalOff;
  5024   pPager->stmtSize = pPager->dbSize;
  5025   pPager->stmtHdrOff = 0;
  5026   pPager->stmtCksum = pPager->cksumInit;
  5027   if( !pPager->stmtOpen ){
  5028     rc = sqlite3PagerOpentemp(pPager, pPager->stfd, SQLITE_OPEN_SUBJOURNAL);
  5029     if( rc ){
  5030       goto stmt_begin_failed;
  5031     }
  5032     pPager->stmtOpen = 1;
  5033     pPager->stmtNRec = 0;
  5034   }
  5035   pPager->stmtInUse = 1;
  5036   return SQLITE_OK;
  5037  
  5038 stmt_begin_failed:
  5039   if( pPager->pInStmt ){
  5040     sqlite3BitvecDestroy(pPager->pInStmt);
  5041     pPager->pInStmt = 0;
  5042   }
  5043   return rc;
  5044 }
  5045 int sqlite3PagerStmtBegin(Pager *pPager){
  5046   int rc;
  5047   pagerEnter(pPager);
  5048   rc = pagerStmtBegin(pPager);
  5049   pagerLeave(pPager);
  5050   return rc;
  5051 }
  5052 
  5053 /*
  5054 ** Commit a statement.
  5055 */
  5056 int sqlite3PagerStmtCommit(Pager *pPager){
  5057   pagerEnter(pPager);
  5058   if( pPager->stmtInUse ){
  5059     PgHdr *pPg, *pNext;
  5060     PAGERTRACE2("STMT-COMMIT %d\n", PAGERID(pPager));
  5061     if( !MEMDB ){
  5062       /* sqlite3OsTruncate(pPager->stfd, 0); */
  5063       sqlite3BitvecDestroy(pPager->pInStmt);
  5064       pPager->pInStmt = 0;
  5065     }else{
  5066       for(pPg=pPager->pStmt; pPg; pPg=pNext){
  5067         PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  5068         pNext = pHist->pNextStmt;
  5069         assert( pHist->inStmt );
  5070         pHist->inStmt = 0;
  5071         pHist->pPrevStmt = pHist->pNextStmt = 0;
  5072         sqlite3PageFree(pHist->pStmt);
  5073         pHist->pStmt = 0;
  5074       }
  5075     }
  5076     pPager->stmtNRec = 0;
  5077     pPager->stmtInUse = 0;
  5078     pPager->pStmt = 0;
  5079   }
  5080   pPager->stmtAutoopen = 0;
  5081   pagerLeave(pPager);
  5082   return SQLITE_OK;
  5083 }
  5084 
  5085 /*
  5086 ** Rollback a statement.
  5087 */
  5088 int sqlite3PagerStmtRollback(Pager *pPager){
  5089   int rc;
  5090   pagerEnter(pPager);
  5091   if( pPager->stmtInUse ){
  5092     PAGERTRACE2("STMT-ROLLBACK %d\n", PAGERID(pPager));
  5093     if( MEMDB ){
  5094       PgHdr *pPg;
  5095       PgHistory *pHist;
  5096       for(pPg=pPager->pStmt; pPg; pPg=pHist->pNextStmt){
  5097         pHist = PGHDR_TO_HIST(pPg, pPager);
  5098         if( pHist->pStmt ){
  5099           memcpy(PGHDR_TO_DATA(pPg), pHist->pStmt, pPager->pageSize);
  5100           sqlite3PageFree(pHist->pStmt);
  5101           pHist->pStmt = 0;
  5102         }
  5103       }
  5104       pPager->dbSize = pPager->stmtSize;
  5105       pager_truncate_cache(pPager);
  5106       rc = SQLITE_OK;
  5107     }else{
  5108       rc = pager_stmt_playback(pPager);
  5109     }
  5110     sqlite3PagerStmtCommit(pPager);
  5111   }else{
  5112     rc = SQLITE_OK;
  5113   }
  5114   pPager->stmtAutoopen = 0;
  5115   pagerLeave(pPager);
  5116   return rc;
  5117 }
  5118 
  5119 /*
  5120 ** Return the full pathname of the database file.
  5121 */
  5122 const char *sqlite3PagerFilename(Pager *pPager){
  5123   return pPager->zFilename;
  5124 }
  5125 
  5126 /*
  5127 ** Return the VFS structure for the pager.
  5128 */
  5129 const sqlite3_vfs *sqlite3PagerVfs(Pager *pPager){
  5130   return pPager->pVfs;
  5131 }
  5132 
  5133 /*
  5134 ** Return the file handle for the database file associated
  5135 ** with the pager.  This might return NULL if the file has
  5136 ** not yet been opened.
  5137 */
  5138 sqlite3_file *sqlite3PagerFile(Pager *pPager){
  5139   return pPager->fd;
  5140 }
  5141 
  5142 /*
  5143 ** Return the directory of the database file.
  5144 */
  5145 const char *sqlite3PagerDirname(Pager *pPager){
  5146   return pPager->zDirectory;
  5147 }
  5148 
  5149 /*
  5150 ** Return the full pathname of the journal file.
  5151 */
  5152 const char *sqlite3PagerJournalname(Pager *pPager){
  5153   return pPager->zJournal;
  5154 }
  5155 
  5156 /*
  5157 ** Return true if fsync() calls are disabled for this pager.  Return FALSE
  5158 ** if fsync()s are executed normally.
  5159 */
  5160 int sqlite3PagerNosync(Pager *pPager){
  5161   return pPager->noSync;
  5162 }
  5163 
  5164 #ifdef SQLITE_HAS_CODEC
  5165 /*
  5166 ** Set the codec for this pager
  5167 */
  5168 void sqlite3PagerSetCodec(
  5169   Pager *pPager,
  5170   void *(*xCodec)(void*,void*,Pgno,int),
  5171   void *pCodecArg
  5172 ){
  5173   pPager->xCodec = xCodec;
  5174   pPager->pCodecArg = pCodecArg;
  5175 }
  5176 #endif
  5177 
  5178 #ifndef SQLITE_OMIT_AUTOVACUUM
  5179 /*
  5180 ** Move the page pPg to location pgno in the file.
  5181 **
  5182 ** There must be no references to the page previously located at
  5183 ** pgno (which we call pPgOld) though that page is allowed to be
  5184 ** in cache.  If the page previous located at pgno is not already
  5185 ** in the rollback journal, it is not put there by by this routine.
  5186 **
  5187 ** References to the page pPg remain valid. Updating any
  5188 ** meta-data associated with pPg (i.e. data stored in the nExtra bytes
  5189 ** allocated along with the page) is the responsibility of the caller.
  5190 **
  5191 ** A transaction must be active when this routine is called. It used to be
  5192 ** required that a statement transaction was not active, but this restriction
  5193 ** has been removed (CREATE INDEX needs to move a page when a statement
  5194 ** transaction is active).
  5195 **
  5196 ** If the fourth argument, isCommit, is non-zero, then this page is being
  5197 ** moved as part of a database reorganization just before the transaction 
  5198 ** is being committed. In this case, it is guaranteed that the database page 
  5199 ** pPg refers to will not be written to again within this transaction.
  5200 */
  5201 int sqlite3PagerMovepage(Pager *pPager, DbPage *pPg, Pgno pgno, int isCommit){
  5202   PgHdr *pPgOld;  /* The page being overwritten. */
  5203   int h;
  5204   Pgno needSyncPgno = 0;
  5205 
  5206   pagerEnter(pPager);
  5207   assert( pPg->nRef>0 );
  5208 
  5209   PAGERTRACE5("MOVE %d page %d (needSync=%d) moves to %d\n", 
  5210       PAGERID(pPager), pPg->pgno, pPg->needSync, pgno);
  5211   IOTRACE(("MOVE %p %d %d\n", pPager, pPg->pgno, pgno))
  5212 
  5213   pager_get_content(pPg);
  5214 
  5215   /* If the journal needs to be sync()ed before page pPg->pgno can
  5216   ** be written to, store pPg->pgno in local variable needSyncPgno.
  5217   **
  5218   ** If the isCommit flag is set, there is no need to remember that
  5219   ** the journal needs to be sync()ed before database page pPg->pgno 
  5220   ** can be written to. The caller has already promised not to write to it.
  5221   */
  5222   if( pPg->needSync && !isCommit ){
  5223     needSyncPgno = pPg->pgno;
  5224     assert( pPg->inJournal || (int)pgno>pPager->origDbSize );
  5225     assert( pPg->dirty );
  5226     assert( pPager->needSync );
  5227   }
  5228 
  5229   /* Unlink pPg from its hash-chain */
  5230   unlinkHashChain(pPager, pPg);
  5231 
  5232   /* If the cache contains a page with page-number pgno, remove it
  5233   ** from its hash chain. Also, if the PgHdr.needSync was set for 
  5234   ** page pgno before the 'move' operation, it needs to be retained 
  5235   ** for the page moved there.
  5236   */
  5237   pPg->needSync = 0;
  5238   pPgOld = pager_lookup(pPager, pgno);
  5239   if( pPgOld ){
  5240     assert( pPgOld->nRef==0 );
  5241     unlinkHashChain(pPager, pPgOld);
  5242     makeClean(pPgOld);
  5243     pPg->needSync = pPgOld->needSync;
  5244   }else{
  5245     pPg->needSync = 0;
  5246   }
  5247   pPg->inJournal = sqlite3BitvecTest(pPager->pInJournal, pgno);
  5248 
  5249   /* Change the page number for pPg and insert it into the new hash-chain. */
  5250   assert( pgno!=0 );
  5251   pPg->pgno = pgno;
  5252   h = pgno & (pPager->nHash-1);
  5253   if( pPager->aHash[h] ){
  5254     assert( pPager->aHash[h]->pPrevHash==0 );
  5255     pPager->aHash[h]->pPrevHash = pPg;
  5256   }
  5257   pPg->pNextHash = pPager->aHash[h];
  5258   pPager->aHash[h] = pPg;
  5259   pPg->pPrevHash = 0;
  5260 
  5261   makeDirty(pPg);
  5262   pPager->dirtyCache = 1;
  5263   pPager->dbModified = 1;
  5264 
  5265   if( needSyncPgno ){
  5266     /* If needSyncPgno is non-zero, then the journal file needs to be 
  5267     ** sync()ed before any data is written to database file page needSyncPgno.
  5268     ** Currently, no such page exists in the page-cache and the 
  5269     ** "is journaled" bitvec flag has been set. This needs to be remedied by
  5270     ** loading the page into the pager-cache and setting the PgHdr.needSync 
  5271     ** flag.
  5272     **
  5273     ** If the attempt to load the page into the page-cache fails, (due
  5274     ** to a malloc() or IO failure), clear the bit in the pInJournal[]
  5275     ** array. Otherwise, if the page is loaded and written again in
  5276     ** this transaction, it may be written to the database file before
  5277     ** it is synced into the journal file. This way, it may end up in
  5278     ** the journal file twice, but that is not a problem.
  5279     **
  5280     ** The sqlite3PagerGet() call may cause the journal to sync. So make
  5281     ** sure the Pager.needSync flag is set too.
  5282     */
  5283     int rc;
  5284     PgHdr *pPgHdr;
  5285     assert( pPager->needSync );
  5286     rc = sqlite3PagerGet(pPager, needSyncPgno, &pPgHdr);
  5287     if( rc!=SQLITE_OK ){
  5288       if( pPager->pInJournal && (int)needSyncPgno<=pPager->origDbSize ){
  5289         sqlite3BitvecClear(pPager->pInJournal, needSyncPgno);
  5290       }
  5291       pagerLeave(pPager);
  5292       return rc;
  5293     }
  5294     pPager->needSync = 1;
  5295     pPgHdr->needSync = 1;
  5296     pPgHdr->inJournal = 1;
  5297     makeDirty(pPgHdr);
  5298     sqlite3PagerUnref(pPgHdr);
  5299   }
  5300 
  5301   pagerLeave(pPager);
  5302   return SQLITE_OK;
  5303 }
  5304 #endif
  5305 
  5306 /*
  5307 ** Return a pointer to the data for the specified page.
  5308 */
  5309 void *sqlite3PagerGetData(DbPage *pPg){
  5310   return PGHDR_TO_DATA(pPg);
  5311 }
  5312 
  5313 /*
  5314 ** Return a pointer to the Pager.nExtra bytes of "extra" space 
  5315 ** allocated along with the specified page.
  5316 */
  5317 void *sqlite3PagerGetExtra(DbPage *pPg){
  5318   Pager *pPager = pPg->pPager;
  5319   return (pPager?PGHDR_TO_EXTRA(pPg, pPager):0);
  5320 }
  5321 
  5322 /*
  5323 ** Get/set the locking-mode for this pager. Parameter eMode must be one
  5324 ** of PAGER_LOCKINGMODE_QUERY, PAGER_LOCKINGMODE_NORMAL or 
  5325 ** PAGER_LOCKINGMODE_EXCLUSIVE. If the parameter is not _QUERY, then
  5326 ** the locking-mode is set to the value specified.
  5327 **
  5328 ** The returned value is either PAGER_LOCKINGMODE_NORMAL or
  5329 ** PAGER_LOCKINGMODE_EXCLUSIVE, indicating the current (possibly updated)
  5330 ** locking-mode.
  5331 */
  5332 int sqlite3PagerLockingMode(Pager *pPager, int eMode){
  5333   assert( eMode==PAGER_LOCKINGMODE_QUERY
  5334             || eMode==PAGER_LOCKINGMODE_NORMAL
  5335             || eMode==PAGER_LOCKINGMODE_EXCLUSIVE );
  5336   assert( PAGER_LOCKINGMODE_QUERY<0 );
  5337   assert( PAGER_LOCKINGMODE_NORMAL>=0 && PAGER_LOCKINGMODE_EXCLUSIVE>=0 );
  5338   if( eMode>=0 && !pPager->tempFile ){
  5339     pPager->exclusiveMode = eMode;
  5340   }
  5341   return (int)pPager->exclusiveMode;
  5342 }
  5343 
  5344 /*
  5345 ** Get/set the journal-mode for this pager. Parameter eMode must be one
  5346 ** of PAGER_JOURNALMODE_QUERY, PAGER_JOURNALMODE_DELETE or 
  5347 ** PAGER_JOURNALMODE_PERSIST. If the parameter is not _QUERY, then
  5348 ** the journal-mode is set to the value specified.
  5349 **
  5350 ** The returned value is either PAGER_JOURNALMODE_DELETE or
  5351 ** PAGER_JOURNALMODE_PERSIST, indicating the current (possibly updated)
  5352 ** journal-mode.
  5353 */
  5354 int sqlite3PagerJournalMode(Pager *pPager, int eMode){
  5355   assert( eMode==PAGER_JOURNALMODE_QUERY
  5356             || eMode==PAGER_JOURNALMODE_DELETE
  5357             || eMode==PAGER_JOURNALMODE_PERSIST
  5358             || eMode==PAGER_JOURNALMODE_OFF );
  5359   assert( PAGER_JOURNALMODE_QUERY<0 );
  5360   assert( PAGER_JOURNALMODE_DELETE>=0 && PAGER_JOURNALMODE_PERSIST>=0 );
  5361   if( eMode>=0 ){
  5362     pPager->journalMode = eMode;
  5363   }
  5364   return (int)pPager->journalMode;
  5365 }
  5366 
  5367 /*
  5368 ** Get/set the size-limit used for persistent journal files.
  5369 */
  5370 i64 sqlite3PagerJournalSizeLimit(Pager *pPager, i64 iLimit){
  5371   if( iLimit>=-1 ){
  5372     pPager->journalSizeLimit = iLimit;
  5373   }
  5374   return pPager->journalSizeLimit;
  5375 }
  5376 
  5377 #endif /* SQLITE_OMIT_DISKIO */