os/persistentdata/persistentstorage/sqlite3api/SQLite/btree.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ** 2004 April 6
     3 **
     4 ** The author disclaims copyright to this source code.  In place of
     5 ** a legal notice, here is a blessing:
     6 **
     7 **    May you do good and not evil.
     8 **    May you find forgiveness for yourself and forgive others.
     9 **    May you share freely, never taking more than you give.
    10 **
    11 *************************************************************************
    12 ** $Id: btree.c,v 1.524 2008/09/30 17:18:17 drh Exp $
    13 **
    14 ** This file implements a external (disk-based) database using BTrees.
    15 ** See the header comment on "btreeInt.h" for additional information.
    16 ** Including a description of file format and an overview of operation.
    17 */
    18 #include "btreeInt.h"
    19 
    20 /*
    21 ** The header string that appears at the beginning of every
    22 ** SQLite database.
    23 */
    24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
    25 
    26 /*
    27 ** Set this global variable to 1 to enable tracing using the TRACE
    28 ** macro.
    29 */
    30 #if 0
    31 int sqlite3BtreeTrace=0;  /* True to enable tracing */
    32 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
    33 #else
    34 # define TRACE(X)
    35 #endif
    36 
    37 /*
    38 ** Sometimes we need a small amount of code such as a variable initialization
    39 ** to setup for a later assert() statement.  We do not want this code to
    40 ** appear when assert() is disabled.  The following macro is therefore
    41 ** used to contain that setup code.  The "VVA" acronym stands for
    42 ** "Verification, Validation, and Accreditation".  In other words, the
    43 ** code within VVA_ONLY() will only run during verification processes.
    44 */
    45 #ifndef NDEBUG
    46 # define VVA_ONLY(X)  X
    47 #else
    48 # define VVA_ONLY(X)
    49 #endif
    50 
    51 
    52 
    53 #ifndef SQLITE_OMIT_SHARED_CACHE
    54 /*
    55 ** A list of BtShared objects that are eligible for participation
    56 ** in shared cache.  This variable has file scope during normal builds,
    57 ** but the test harness needs to access it so we make it global for 
    58 ** test builds.
    59 */
    60 #ifdef SQLITE_TEST
    61 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    62 #else
    63 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    64 #endif
    65 #endif /* SQLITE_OMIT_SHARED_CACHE */
    66 
    67 #ifndef SQLITE_OMIT_SHARED_CACHE
    68 /*
    69 ** Enable or disable the shared pager and schema features.
    70 **
    71 ** This routine has no effect on existing database connections.
    72 ** The shared cache setting effects only future calls to
    73 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
    74 */
    75 SQLITE_EXPORT int sqlite3_enable_shared_cache(int enable){
    76   sqlite3GlobalConfig.sharedCacheEnabled = enable;
    77   return SQLITE_OK;
    78 }
    79 #endif
    80 
    81 
    82 /*
    83 ** Forward declaration
    84 */
    85 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
    86 
    87 
    88 #ifdef SQLITE_OMIT_SHARED_CACHE
    89   /*
    90   ** The functions queryTableLock(), lockTable() and unlockAllTables()
    91   ** manipulate entries in the BtShared.pLock linked list used to store
    92   ** shared-cache table level locks. If the library is compiled with the
    93   ** shared-cache feature disabled, then there is only ever one user
    94   ** of each BtShared structure and so this locking is not necessary. 
    95   ** So define the lock related functions as no-ops.
    96   */
    97   #define queryTableLock(a,b,c) SQLITE_OK
    98   #define lockTable(a,b,c) SQLITE_OK
    99   #define unlockAllTables(a)
   100 #endif
   101 
   102 #ifndef SQLITE_OMIT_SHARED_CACHE
   103 /*
   104 ** Query to see if btree handle p may obtain a lock of type eLock 
   105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
   106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
   107 ** SQLITE_LOCKED if not.
   108 */
   109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
   110   BtShared *pBt = p->pBt;
   111   BtLock *pIter;
   112 
   113   assert( sqlite3BtreeHoldsMutex(p) );
   114   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   115   assert( p->db!=0 );
   116   
   117   /* This is a no-op if the shared-cache is not enabled */
   118   if( !p->sharable ){
   119     return SQLITE_OK;
   120   }
   121 
   122   /* If some other connection is holding an exclusive lock, the
   123   ** requested lock may not be obtained.
   124   */
   125   if( pBt->pExclusive && pBt->pExclusive!=p ){
   126     return SQLITE_LOCKED;
   127   }
   128 
   129   /* This (along with lockTable()) is where the ReadUncommitted flag is
   130   ** dealt with. If the caller is querying for a read-lock and the flag is
   131   ** set, it is unconditionally granted - even if there are write-locks
   132   ** on the table. If a write-lock is requested, the ReadUncommitted flag
   133   ** is not considered.
   134   **
   135   ** In function lockTable(), if a read-lock is demanded and the 
   136   ** ReadUncommitted flag is set, no entry is added to the locks list 
   137   ** (BtShared.pLock).
   138   **
   139   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
   140   ** not create or respect table locks. The locking procedure for a 
   141   ** write-cursor does not change.
   142   */
   143   if( 
   144     0==(p->db->flags&SQLITE_ReadUncommitted) || 
   145     eLock==WRITE_LOCK ||
   146     iTab==MASTER_ROOT
   147   ){
   148     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   149       if( pIter->pBtree!=p && pIter->iTable==iTab && 
   150           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
   151         return SQLITE_LOCKED;
   152       }
   153     }
   154   }
   155   return SQLITE_OK;
   156 }
   157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
   158 
   159 #ifndef SQLITE_OMIT_SHARED_CACHE
   160 /*
   161 ** Add a lock on the table with root-page iTable to the shared-btree used
   162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
   163 ** WRITE_LOCK.
   164 **
   165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
   166 ** SQLITE_NOMEM may also be returned.
   167 */
   168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
   169   BtShared *pBt = p->pBt;
   170   BtLock *pLock = 0;
   171   BtLock *pIter;
   172 
   173   assert( sqlite3BtreeHoldsMutex(p) );
   174   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   175   assert( p->db!=0 );
   176 
   177   /* This is a no-op if the shared-cache is not enabled */
   178   if( !p->sharable ){
   179     return SQLITE_OK;
   180   }
   181 
   182   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
   183 
   184   /* If the read-uncommitted flag is set and a read-lock is requested,
   185   ** return early without adding an entry to the BtShared.pLock list. See
   186   ** comment in function queryTableLock() for more info on handling 
   187   ** the ReadUncommitted flag.
   188   */
   189   if( 
   190     (p->db->flags&SQLITE_ReadUncommitted) && 
   191     (eLock==READ_LOCK) &&
   192     iTable!=MASTER_ROOT
   193   ){
   194     return SQLITE_OK;
   195   }
   196 
   197   /* First search the list for an existing lock on this table. */
   198   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   199     if( pIter->iTable==iTable && pIter->pBtree==p ){
   200       pLock = pIter;
   201       break;
   202     }
   203   }
   204 
   205   /* If the above search did not find a BtLock struct associating Btree p
   206   ** with table iTable, allocate one and link it into the list.
   207   */
   208   if( !pLock ){
   209     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
   210     if( !pLock ){
   211       return SQLITE_NOMEM;
   212     }
   213     pLock->iTable = iTable;
   214     pLock->pBtree = p;
   215     pLock->pNext = pBt->pLock;
   216     pBt->pLock = pLock;
   217   }
   218 
   219   /* Set the BtLock.eLock variable to the maximum of the current lock
   220   ** and the requested lock. This means if a write-lock was already held
   221   ** and a read-lock requested, we don't incorrectly downgrade the lock.
   222   */
   223   assert( WRITE_LOCK>READ_LOCK );
   224   if( eLock>pLock->eLock ){
   225     pLock->eLock = eLock;
   226   }
   227 
   228   return SQLITE_OK;
   229 }
   230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
   231 
   232 #ifndef SQLITE_OMIT_SHARED_CACHE
   233 /*
   234 ** Release all the table locks (locks obtained via calls to the lockTable()
   235 ** procedure) held by Btree handle p.
   236 */
   237 static void unlockAllTables(Btree *p){
   238   BtShared *pBt = p->pBt;
   239   BtLock **ppIter = &pBt->pLock;
   240 
   241   assert( sqlite3BtreeHoldsMutex(p) );
   242   assert( p->sharable || 0==*ppIter );
   243 
   244   while( *ppIter ){
   245     BtLock *pLock = *ppIter;
   246     assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
   247     if( pLock->pBtree==p ){
   248       *ppIter = pLock->pNext;
   249       sqlite3_free(pLock);
   250     }else{
   251       ppIter = &pLock->pNext;
   252     }
   253   }
   254 
   255   if( pBt->pExclusive==p ){
   256     pBt->pExclusive = 0;
   257   }
   258 }
   259 #endif /* SQLITE_OMIT_SHARED_CACHE */
   260 
   261 static void releasePage(MemPage *pPage);  /* Forward reference */
   262 
   263 /*
   264 ** Verify that the cursor holds a mutex on the BtShared
   265 */
   266 #ifndef NDEBUG
   267 static int cursorHoldsMutex(BtCursor *p){
   268   return sqlite3_mutex_held(p->pBt->mutex);
   269 }
   270 #endif
   271 
   272 
   273 #ifndef SQLITE_OMIT_INCRBLOB
   274 /*
   275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
   276 */
   277 static void invalidateOverflowCache(BtCursor *pCur){
   278   assert( cursorHoldsMutex(pCur) );
   279   sqlite3_free(pCur->aOverflow);
   280   pCur->aOverflow = 0;
   281 }
   282 
   283 /*
   284 ** Invalidate the overflow page-list cache for all cursors opened
   285 ** on the shared btree structure pBt.
   286 */
   287 static void invalidateAllOverflowCache(BtShared *pBt){
   288   BtCursor *p;
   289   assert( sqlite3_mutex_held(pBt->mutex) );
   290   for(p=pBt->pCursor; p; p=p->pNext){
   291     invalidateOverflowCache(p);
   292   }
   293 }
   294 #else
   295   #define invalidateOverflowCache(x)
   296   #define invalidateAllOverflowCache(x)
   297 #endif
   298 
   299 /*
   300 ** Save the current cursor position in the variables BtCursor.nKey 
   301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
   302 */
   303 static int saveCursorPosition(BtCursor *pCur){
   304   int rc;
   305 
   306   assert( CURSOR_VALID==pCur->eState );
   307   assert( 0==pCur->pKey );
   308   assert( cursorHoldsMutex(pCur) );
   309 
   310   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
   311 
   312   /* If this is an intKey table, then the above call to BtreeKeySize()
   313   ** stores the integer key in pCur->nKey. In this case this value is
   314   ** all that is required. Otherwise, if pCur is not open on an intKey
   315   ** table, then malloc space for and store the pCur->nKey bytes of key 
   316   ** data.
   317   */
   318   if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
   319     void *pKey = sqlite3Malloc(pCur->nKey);
   320     if( pKey ){
   321       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
   322       if( rc==SQLITE_OK ){
   323         pCur->pKey = pKey;
   324       }else{
   325         sqlite3_free(pKey);
   326       }
   327     }else{
   328       rc = SQLITE_NOMEM;
   329     }
   330   }
   331   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
   332 
   333   if( rc==SQLITE_OK ){
   334     int i;
   335     for(i=0; i<=pCur->iPage; i++){
   336       releasePage(pCur->apPage[i]);
   337       pCur->apPage[i] = 0;
   338     }
   339     pCur->iPage = -1;
   340     pCur->eState = CURSOR_REQUIRESEEK;
   341   }
   342 
   343   invalidateOverflowCache(pCur);
   344   return rc;
   345 }
   346 
   347 /*
   348 ** Save the positions of all cursors except pExcept open on the table 
   349 ** with root-page iRoot. Usually, this is called just before cursor
   350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
   351 */
   352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
   353   BtCursor *p;
   354   assert( sqlite3_mutex_held(pBt->mutex) );
   355   assert( pExcept==0 || pExcept->pBt==pBt );
   356   for(p=pBt->pCursor; p; p=p->pNext){
   357     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 
   358         p->eState==CURSOR_VALID ){
   359       int rc = saveCursorPosition(p);
   360       if( SQLITE_OK!=rc ){
   361         return rc;
   362       }
   363     }
   364   }
   365   return SQLITE_OK;
   366 }
   367 
   368 /*
   369 ** Clear the current cursor position.
   370 */
   371 static void clearCursorPosition(BtCursor *pCur){
   372   assert( cursorHoldsMutex(pCur) );
   373   sqlite3_free(pCur->pKey);
   374   pCur->pKey = 0;
   375   pCur->eState = CURSOR_INVALID;
   376 }
   377 
   378 /*
   379 ** Restore the cursor to the position it was in (or as close to as possible)
   380 ** when saveCursorPosition() was called. Note that this call deletes the 
   381 ** saved position info stored by saveCursorPosition(), so there can be
   382 ** at most one effective restoreCursorPosition() call after each 
   383 ** saveCursorPosition().
   384 */
   385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
   386   int rc;
   387   assert( cursorHoldsMutex(pCur) );
   388   assert( pCur->eState>=CURSOR_REQUIRESEEK );
   389   if( pCur->eState==CURSOR_FAULT ){
   390     return pCur->skip;
   391   }
   392   pCur->eState = CURSOR_INVALID;
   393   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
   394   if( rc==SQLITE_OK ){
   395     sqlite3_free(pCur->pKey);
   396     pCur->pKey = 0;
   397     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
   398   }
   399   return rc;
   400 }
   401 
   402 #define restoreCursorPosition(p) \
   403   (p->eState>=CURSOR_REQUIRESEEK ? \
   404          sqlite3BtreeRestoreCursorPosition(p) : \
   405          SQLITE_OK)
   406 
   407 /*
   408 ** Determine whether or not a cursor has moved from the position it
   409 ** was last placed at.  Cursor can move when the row they are pointing
   410 ** at is deleted out from under them.
   411 **
   412 ** This routine returns an error code if something goes wrong.  The
   413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
   414 */
   415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
   416   int rc;
   417 
   418   rc = restoreCursorPosition(pCur);
   419   if( rc ){
   420     *pHasMoved = 1;
   421     return rc;
   422   }
   423   if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
   424     *pHasMoved = 1;
   425   }else{
   426     *pHasMoved = 0;
   427   }
   428   return SQLITE_OK;
   429 }
   430 
   431 #ifndef SQLITE_OMIT_AUTOVACUUM
   432 /*
   433 ** Given a page number of a regular database page, return the page
   434 ** number for the pointer-map page that contains the entry for the
   435 ** input page number.
   436 */
   437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
   438   int nPagesPerMapPage, iPtrMap, ret;
   439   assert( sqlite3_mutex_held(pBt->mutex) );
   440   nPagesPerMapPage = (pBt->usableSize/5)+1;
   441   iPtrMap = (pgno-2)/nPagesPerMapPage;
   442   ret = (iPtrMap*nPagesPerMapPage) + 2; 
   443   if( ret==PENDING_BYTE_PAGE(pBt) ){
   444     ret++;
   445   }
   446   return ret;
   447 }
   448 
   449 /*
   450 ** Write an entry into the pointer map.
   451 **
   452 ** This routine updates the pointer map entry for page number 'key'
   453 ** so that it maps to type 'eType' and parent page number 'pgno'.
   454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
   455 */
   456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
   457   DbPage *pDbPage;  /* The pointer map page */
   458   u8 *pPtrmap;      /* The pointer map data */
   459   Pgno iPtrmap;     /* The pointer map page number */
   460   int offset;       /* Offset in pointer map page */
   461   int rc;
   462 
   463   assert( sqlite3_mutex_held(pBt->mutex) );
   464   /* The master-journal page number must never be used as a pointer map page */
   465   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
   466 
   467   assert( pBt->autoVacuum );
   468   if( key==0 ){
   469     return SQLITE_CORRUPT_BKPT;
   470   }
   471   iPtrmap = PTRMAP_PAGENO(pBt, key);
   472   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   473   if( rc!=SQLITE_OK ){
   474     return rc;
   475   }
   476   offset = PTRMAP_PTROFFSET(iPtrmap, key);
   477   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   478 
   479   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
   480     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
   481     rc = sqlite3PagerWrite(pDbPage);
   482     if( rc==SQLITE_OK ){
   483       pPtrmap[offset] = eType;
   484       put4byte(&pPtrmap[offset+1], parent);
   485     }
   486   }
   487 
   488   sqlite3PagerUnref(pDbPage);
   489   return rc;
   490 }
   491 
   492 /*
   493 ** Read an entry from the pointer map.
   494 **
   495 ** This routine retrieves the pointer map entry for page 'key', writing
   496 ** the type and parent page number to *pEType and *pPgno respectively.
   497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
   498 */
   499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
   500   DbPage *pDbPage;   /* The pointer map page */
   501   int iPtrmap;       /* Pointer map page index */
   502   u8 *pPtrmap;       /* Pointer map page data */
   503   int offset;        /* Offset of entry in pointer map */
   504   int rc;
   505 
   506   assert( sqlite3_mutex_held(pBt->mutex) );
   507 
   508   iPtrmap = PTRMAP_PAGENO(pBt, key);
   509   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   510   if( rc!=0 ){
   511     return rc;
   512   }
   513   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   514 
   515   offset = PTRMAP_PTROFFSET(iPtrmap, key);
   516   assert( pEType!=0 );
   517   *pEType = pPtrmap[offset];
   518   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
   519 
   520   sqlite3PagerUnref(pDbPage);
   521   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
   522   return SQLITE_OK;
   523 }
   524 
   525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
   526   #define ptrmapPut(w,x,y,z) SQLITE_OK
   527   #define ptrmapGet(w,x,y,z) SQLITE_OK
   528   #define ptrmapPutOvfl(y,z) SQLITE_OK
   529 #endif
   530 
   531 /*
   532 ** Given a btree page and a cell index (0 means the first cell on
   533 ** the page, 1 means the second cell, and so forth) return a pointer
   534 ** to the cell content.
   535 **
   536 ** This routine works only for pages that do not contain overflow cells.
   537 */
   538 #define findCell(P,I) \
   539   ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
   540 
   541 /*
   542 ** This a more complex version of findCell() that works for
   543 ** pages that do contain overflow cells.  See insert
   544 */
   545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
   546   int i;
   547   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   548   for(i=pPage->nOverflow-1; i>=0; i--){
   549     int k;
   550     struct _OvflCell *pOvfl;
   551     pOvfl = &pPage->aOvfl[i];
   552     k = pOvfl->idx;
   553     if( k<=iCell ){
   554       if( k==iCell ){
   555         return pOvfl->pCell;
   556       }
   557       iCell--;
   558     }
   559   }
   560   return findCell(pPage, iCell);
   561 }
   562 
   563 /*
   564 ** Parse a cell content block and fill in the CellInfo structure.  There
   565 ** are two versions of this function.  sqlite3BtreeParseCell() takes a 
   566 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 
   567 ** takes a pointer to the body of the cell as its second argument.
   568 **
   569 ** Within this file, the parseCell() macro can be called instead of
   570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
   571 */
   572 void sqlite3BtreeParseCellPtr(
   573   MemPage *pPage,         /* Page containing the cell */
   574   u8 *pCell,              /* Pointer to the cell text. */
   575   CellInfo *pInfo         /* Fill in this structure */
   576 ){
   577   int n;                  /* Number bytes in cell content header */
   578   u32 nPayload;           /* Number of bytes of cell payload */
   579 
   580   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   581 
   582   pInfo->pCell = pCell;
   583   assert( pPage->leaf==0 || pPage->leaf==1 );
   584   n = pPage->childPtrSize;
   585   assert( n==4-4*pPage->leaf );
   586   if( pPage->intKey ){
   587     if( pPage->hasData ){
   588       n += getVarint32(&pCell[n], nPayload);
   589     }else{
   590       nPayload = 0;
   591     }
   592     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
   593     pInfo->nData = nPayload;
   594   }else{
   595     pInfo->nData = 0;
   596     n += getVarint32(&pCell[n], nPayload);
   597     pInfo->nKey = nPayload;
   598   }
   599   pInfo->nPayload = nPayload;
   600   pInfo->nHeader = n;
   601   if( likely(nPayload<=pPage->maxLocal) ){
   602     /* This is the (easy) common case where the entire payload fits
   603     ** on the local page.  No overflow is required.
   604     */
   605     int nSize;          /* Total size of cell content in bytes */
   606     nSize = nPayload + n;
   607     pInfo->nLocal = nPayload;
   608     pInfo->iOverflow = 0;
   609     if( (nSize & ~3)==0 ){
   610       nSize = 4;        /* Minimum cell size is 4 */
   611     }
   612     pInfo->nSize = nSize;
   613   }else{
   614     /* If the payload will not fit completely on the local page, we have
   615     ** to decide how much to store locally and how much to spill onto
   616     ** overflow pages.  The strategy is to minimize the amount of unused
   617     ** space on overflow pages while keeping the amount of local storage
   618     ** in between minLocal and maxLocal.
   619     **
   620     ** Warning:  changing the way overflow payload is distributed in any
   621     ** way will result in an incompatible file format.
   622     */
   623     int minLocal;  /* Minimum amount of payload held locally */
   624     int maxLocal;  /* Maximum amount of payload held locally */
   625     int surplus;   /* Overflow payload available for local storage */
   626 
   627     minLocal = pPage->minLocal;
   628     maxLocal = pPage->maxLocal;
   629     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
   630     if( surplus <= maxLocal ){
   631       pInfo->nLocal = surplus;
   632     }else{
   633       pInfo->nLocal = minLocal;
   634     }
   635     pInfo->iOverflow = pInfo->nLocal + n;
   636     pInfo->nSize = pInfo->iOverflow + 4;
   637   }
   638 }
   639 #define parseCell(pPage, iCell, pInfo) \
   640   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
   641 void sqlite3BtreeParseCell(
   642   MemPage *pPage,         /* Page containing the cell */
   643   int iCell,              /* The cell index.  First cell is 0 */
   644   CellInfo *pInfo         /* Fill in this structure */
   645 ){
   646   parseCell(pPage, iCell, pInfo);
   647 }
   648 
   649 /*
   650 ** Compute the total number of bytes that a Cell needs in the cell
   651 ** data area of the btree-page.  The return number includes the cell
   652 ** data header and the local payload, but not any overflow page or
   653 ** the space used by the cell pointer.
   654 */
   655 #ifndef NDEBUG
   656 static u16 cellSize(MemPage *pPage, int iCell){
   657   CellInfo info;
   658   sqlite3BtreeParseCell(pPage, iCell, &info);
   659   return info.nSize;
   660 }
   661 #endif
   662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
   663   CellInfo info;
   664   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
   665   return info.nSize;
   666 }
   667 
   668 #ifndef SQLITE_OMIT_AUTOVACUUM
   669 /*
   670 ** If the cell pCell, part of page pPage contains a pointer
   671 ** to an overflow page, insert an entry into the pointer-map
   672 ** for the overflow page.
   673 */
   674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
   675   CellInfo info;
   676   assert( pCell!=0 );
   677   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
   678   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
   679   if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
   680     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
   681     return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
   682   }
   683   return SQLITE_OK;
   684 }
   685 /*
   686 ** If the cell with index iCell on page pPage contains a pointer
   687 ** to an overflow page, insert an entry into the pointer-map
   688 ** for the overflow page.
   689 */
   690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
   691   u8 *pCell;
   692   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   693   pCell = findOverflowCell(pPage, iCell);
   694   return ptrmapPutOvflPtr(pPage, pCell);
   695 }
   696 #endif
   697 
   698 
   699 /*
   700 ** Defragment the page given.  All Cells are moved to the
   701 ** end of the page and all free space is collected into one
   702 ** big FreeBlk that occurs in between the header and cell
   703 ** pointer array and the cell content area.
   704 */
   705 static void defragmentPage(MemPage *pPage){
   706   int i;                     /* Loop counter */
   707   int pc;                    /* Address of a i-th cell */
   708   int addr;                  /* Offset of first byte after cell pointer array */
   709   int hdr;                   /* Offset to the page header */
   710   int size;                  /* Size of a cell */
   711   int usableSize;            /* Number of usable bytes on a page */
   712   int cellOffset;            /* Offset to the cell pointer array */
   713   int cbrk;                  /* Offset to the cell content area */
   714   int nCell;                 /* Number of cells on the page */
   715   unsigned char *data;       /* The page data */
   716   unsigned char *temp;       /* Temp area for cell content */
   717 
   718   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   719   assert( pPage->pBt!=0 );
   720   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
   721   assert( pPage->nOverflow==0 );
   722   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   723   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
   724   data = pPage->aData;
   725   hdr = pPage->hdrOffset;
   726   cellOffset = pPage->cellOffset;
   727   nCell = pPage->nCell;
   728   assert( nCell==get2byte(&data[hdr+3]) );
   729   usableSize = pPage->pBt->usableSize;
   730   cbrk = get2byte(&data[hdr+5]);
   731   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
   732   cbrk = usableSize;
   733   for(i=0; i<nCell; i++){
   734     u8 *pAddr;     /* The i-th cell pointer */
   735     pAddr = &data[cellOffset + i*2];
   736     pc = get2byte(pAddr);
   737     assert( pc<pPage->pBt->usableSize );
   738     size = cellSizePtr(pPage, &temp[pc]);
   739     cbrk -= size;
   740     memcpy(&data[cbrk], &temp[pc], size);
   741     put2byte(pAddr, cbrk);
   742   }
   743   assert( cbrk>=cellOffset+2*nCell );
   744   put2byte(&data[hdr+5], cbrk);
   745   data[hdr+1] = 0;
   746   data[hdr+2] = 0;
   747   data[hdr+7] = 0;
   748   addr = cellOffset+2*nCell;
   749   memset(&data[addr], 0, cbrk-addr);
   750 }
   751 
   752 /*
   753 ** Allocate nByte bytes of space on a page.
   754 **
   755 ** Return the index into pPage->aData[] of the first byte of
   756 ** the new allocation.  The caller guarantees that there is enough
   757 ** space.  This routine will never fail.
   758 **
   759 ** If the page contains nBytes of free space but does not contain
   760 ** nBytes of contiguous free space, then this routine automatically
   761 ** calls defragementPage() to consolidate all free space before 
   762 ** allocating the new chunk.
   763 */
   764 static int allocateSpace(MemPage *pPage, int nByte){
   765   int addr, pc, hdr;
   766   int size;
   767   int nFrag;
   768   int top;
   769   int nCell;
   770   int cellOffset;
   771   unsigned char *data;
   772   
   773   data = pPage->aData;
   774   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   775   assert( pPage->pBt );
   776   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   777   assert( nByte>=0 );  /* Minimum cell size is 4 */
   778   assert( pPage->nFree>=nByte );
   779   assert( pPage->nOverflow==0 );
   780   pPage->nFree -= nByte;
   781   hdr = pPage->hdrOffset;
   782 
   783   nFrag = data[hdr+7];
   784   if( nFrag<60 ){
   785     /* Search the freelist looking for a slot big enough to satisfy the
   786     ** space request. */
   787     addr = hdr+1;
   788     while( (pc = get2byte(&data[addr]))>0 ){
   789       size = get2byte(&data[pc+2]);
   790       if( size>=nByte ){
   791         if( size<nByte+4 ){
   792           memcpy(&data[addr], &data[pc], 2);
   793           data[hdr+7] = nFrag + size - nByte;
   794           return pc;
   795         }else{
   796           put2byte(&data[pc+2], size-nByte);
   797           return pc + size - nByte;
   798         }
   799       }
   800       addr = pc;
   801     }
   802   }
   803 
   804   /* Allocate memory from the gap in between the cell pointer array
   805   ** and the cell content area.
   806   */
   807   top = get2byte(&data[hdr+5]);
   808   nCell = get2byte(&data[hdr+3]);
   809   cellOffset = pPage->cellOffset;
   810   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
   811     defragmentPage(pPage);
   812     top = get2byte(&data[hdr+5]);
   813   }
   814   top -= nByte;
   815   assert( cellOffset + 2*nCell <= top );
   816   put2byte(&data[hdr+5], top);
   817   return top;
   818 }
   819 
   820 /*
   821 ** Return a section of the pPage->aData to the freelist.
   822 ** The first byte of the new free block is pPage->aDisk[start]
   823 ** and the size of the block is "size" bytes.
   824 **
   825 ** Most of the effort here is involved in coalesing adjacent
   826 ** free blocks into a single big free block.
   827 */
   828 static void freeSpace(MemPage *pPage, int start, int size){
   829   int addr, pbegin, hdr;
   830   unsigned char *data = pPage->aData;
   831 
   832   assert( pPage->pBt!=0 );
   833   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   834   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
   835   assert( (start + size)<=pPage->pBt->usableSize );
   836   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   837   assert( size>=0 );   /* Minimum cell size is 4 */
   838 
   839 #ifdef SQLITE_SECURE_DELETE
   840   /* Overwrite deleted information with zeros when the SECURE_DELETE 
   841   ** option is enabled at compile-time */
   842   memset(&data[start], 0, size);
   843 #endif
   844 
   845   /* Add the space back into the linked list of freeblocks */
   846   hdr = pPage->hdrOffset;
   847   addr = hdr + 1;
   848   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
   849     assert( pbegin<=pPage->pBt->usableSize-4 );
   850     assert( pbegin>addr );
   851     addr = pbegin;
   852   }
   853   assert( pbegin<=pPage->pBt->usableSize-4 );
   854   assert( pbegin>addr || pbegin==0 );
   855   put2byte(&data[addr], start);
   856   put2byte(&data[start], pbegin);
   857   put2byte(&data[start+2], size);
   858   pPage->nFree += size;
   859 
   860   /* Coalesce adjacent free blocks */
   861   addr = pPage->hdrOffset + 1;
   862   while( (pbegin = get2byte(&data[addr]))>0 ){
   863     int pnext, psize;
   864     assert( pbegin>addr );
   865     assert( pbegin<=pPage->pBt->usableSize-4 );
   866     pnext = get2byte(&data[pbegin]);
   867     psize = get2byte(&data[pbegin+2]);
   868     if( pbegin + psize + 3 >= pnext && pnext>0 ){
   869       int frag = pnext - (pbegin+psize);
   870       assert( frag<=data[pPage->hdrOffset+7] );
   871       data[pPage->hdrOffset+7] -= frag;
   872       put2byte(&data[pbegin], get2byte(&data[pnext]));
   873       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
   874     }else{
   875       addr = pbegin;
   876     }
   877   }
   878 
   879   /* If the cell content area begins with a freeblock, remove it. */
   880   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
   881     int top;
   882     pbegin = get2byte(&data[hdr+1]);
   883     memcpy(&data[hdr+1], &data[pbegin], 2);
   884     top = get2byte(&data[hdr+5]);
   885     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
   886   }
   887 }
   888 
   889 /*
   890 ** Decode the flags byte (the first byte of the header) for a page
   891 ** and initialize fields of the MemPage structure accordingly.
   892 **
   893 ** Only the following combinations are supported.  Anything different
   894 ** indicates a corrupt database files:
   895 **
   896 **         PTF_ZERODATA
   897 **         PTF_ZERODATA | PTF_LEAF
   898 **         PTF_LEAFDATA | PTF_INTKEY
   899 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
   900 */
   901 static int decodeFlags(MemPage *pPage, int flagByte){
   902   BtShared *pBt;     /* A copy of pPage->pBt */
   903 
   904   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
   905   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   906   pPage->leaf = flagByte>>3;  assert( PTF_LEAF == 1<<3 );
   907   flagByte &= ~PTF_LEAF;
   908   pPage->childPtrSize = 4-4*pPage->leaf;
   909   pBt = pPage->pBt;
   910   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
   911     pPage->intKey = 1;
   912     pPage->hasData = pPage->leaf;
   913     pPage->maxLocal = pBt->maxLeaf;
   914     pPage->minLocal = pBt->minLeaf;
   915   }else if( flagByte==PTF_ZERODATA ){
   916     pPage->intKey = 0;
   917     pPage->hasData = 0;
   918     pPage->maxLocal = pBt->maxLocal;
   919     pPage->minLocal = pBt->minLocal;
   920   }else{
   921     return SQLITE_CORRUPT_BKPT;
   922   }
   923   return SQLITE_OK;
   924 }
   925 
   926 /*
   927 ** Initialize the auxiliary information for a disk block.
   928 **
   929 ** Return SQLITE_OK on success.  If we see that the page does
   930 ** not contain a well-formed database page, then return 
   931 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
   932 ** guarantee that the page is well-formed.  It only shows that
   933 ** we failed to detect any corruption.
   934 */
   935 int sqlite3BtreeInitPage(MemPage *pPage){
   936 
   937   assert( pPage->pBt!=0 );
   938   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   939   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
   940   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
   941   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
   942 
   943   if( !pPage->isInit ){
   944     int pc;            /* Address of a freeblock within pPage->aData[] */
   945     int hdr;           /* Offset to beginning of page header */
   946     u8 *data;          /* Equal to pPage->aData */
   947     BtShared *pBt;        /* The main btree structure */
   948     int usableSize;    /* Amount of usable space on each page */
   949     int cellOffset;    /* Offset from start of page to first cell pointer */
   950     int nFree;         /* Number of unused bytes on the page */
   951     int top;           /* First byte of the cell content area */
   952 
   953     pBt = pPage->pBt;
   954 
   955     hdr = pPage->hdrOffset;
   956     data = pPage->aData;
   957     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
   958     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
   959     pPage->maskPage = pBt->pageSize - 1;
   960     pPage->nOverflow = 0;
   961     usableSize = pBt->usableSize;
   962     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
   963     top = get2byte(&data[hdr+5]);
   964     pPage->nCell = get2byte(&data[hdr+3]);
   965     if( pPage->nCell>MX_CELL(pBt) ){
   966       /* To many cells for a single page.  The page must be corrupt */
   967       return SQLITE_CORRUPT_BKPT;
   968     }
   969   
   970     /* Compute the total free space on the page */
   971     pc = get2byte(&data[hdr+1]);
   972     nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
   973     while( pc>0 ){
   974       int next, size;
   975       if( pc>usableSize-4 ){
   976         /* Free block is off the page */
   977         return SQLITE_CORRUPT_BKPT; 
   978       }
   979       next = get2byte(&data[pc]);
   980       size = get2byte(&data[pc+2]);
   981       if( next>0 && next<=pc+size+3 ){
   982         /* Free blocks must be in accending order */
   983         return SQLITE_CORRUPT_BKPT; 
   984       }
   985       nFree += size;
   986       pc = next;
   987     }
   988     pPage->nFree = nFree;
   989     if( nFree>=usableSize ){
   990       /* Free space cannot exceed total page size */
   991       return SQLITE_CORRUPT_BKPT; 
   992     }
   993 
   994 #if 0
   995   /* Check that all the offsets in the cell offset array are within range. 
   996   ** 
   997   ** Omitting this consistency check and using the pPage->maskPage mask
   998   ** to prevent overrunning the page buffer in findCell() results in a
   999   ** 2.5% performance gain.
  1000   */
  1001   {
  1002     u8 *pOff;        /* Iterator used to check all cell offsets are in range */
  1003     u8 *pEnd;        /* Pointer to end of cell offset array */
  1004     u8 mask;         /* Mask of bits that must be zero in MSB of cell offsets */
  1005     mask = ~(((u8)(pBt->pageSize>>8))-1);
  1006     pEnd = &data[cellOffset + pPage->nCell*2];
  1007     for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
  1008     if( pOff!=pEnd ){
  1009       return SQLITE_CORRUPT_BKPT;
  1010     }
  1011   }
  1012 #endif
  1013 
  1014     pPage->isInit = 1;
  1015   }
  1016   return SQLITE_OK;
  1017 }
  1018 
  1019 /*
  1020 ** Set up a raw page so that it looks like a database page holding
  1021 ** no entries.
  1022 */
  1023 static void zeroPage(MemPage *pPage, int flags){
  1024   unsigned char *data = pPage->aData;
  1025   BtShared *pBt = pPage->pBt;
  1026   int hdr = pPage->hdrOffset;
  1027   int first;
  1028 
  1029   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
  1030   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1031   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
  1032   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1033   assert( sqlite3_mutex_held(pBt->mutex) );
  1034   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
  1035   data[hdr] = flags;
  1036   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
  1037   memset(&data[hdr+1], 0, 4);
  1038   data[hdr+7] = 0;
  1039   put2byte(&data[hdr+5], pBt->usableSize);
  1040   pPage->nFree = pBt->usableSize - first;
  1041   decodeFlags(pPage, flags);
  1042   pPage->hdrOffset = hdr;
  1043   pPage->cellOffset = first;
  1044   pPage->nOverflow = 0;
  1045   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
  1046   pPage->maskPage = pBt->pageSize - 1;
  1047   pPage->nCell = 0;
  1048   pPage->isInit = 1;
  1049 }
  1050 
  1051 
  1052 /*
  1053 ** Convert a DbPage obtained from the pager into a MemPage used by
  1054 ** the btree layer.
  1055 */
  1056 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
  1057   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  1058   pPage->aData = sqlite3PagerGetData(pDbPage);
  1059   pPage->pDbPage = pDbPage;
  1060   pPage->pBt = pBt;
  1061   pPage->pgno = pgno;
  1062   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  1063   return pPage; 
  1064 }
  1065 
  1066 /*
  1067 ** Get a page from the pager.  Initialize the MemPage.pBt and
  1068 ** MemPage.aData elements if needed.
  1069 **
  1070 ** If the noContent flag is set, it means that we do not care about
  1071 ** the content of the page at this time.  So do not go to the disk
  1072 ** to fetch the content.  Just fill in the content with zeros for now.
  1073 ** If in the future we call sqlite3PagerWrite() on this page, that
  1074 ** means we have started to be concerned about content and the disk
  1075 ** read should occur at that point.
  1076 */
  1077 int sqlite3BtreeGetPage(
  1078   BtShared *pBt,       /* The btree */
  1079   Pgno pgno,           /* Number of the page to fetch */
  1080   MemPage **ppPage,    /* Return the page in this parameter */
  1081   int noContent        /* Do not load page content if true */
  1082 ){
  1083   int rc;
  1084   DbPage *pDbPage;
  1085 
  1086   assert( sqlite3_mutex_held(pBt->mutex) );
  1087   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
  1088   if( rc ) return rc;
  1089   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1090   return SQLITE_OK;
  1091 }
  1092 
  1093 /*
  1094 ** Return the size of the database file in pages.  Or return -1 if
  1095 ** there is any kind of error.
  1096 */
  1097 static int pagerPagecount(Pager *pPager){
  1098   int rc;
  1099   int nPage;
  1100   rc = sqlite3PagerPagecount(pPager, &nPage);
  1101   return (rc==SQLITE_OK?nPage:-1);
  1102 }
  1103 
  1104 /*
  1105 ** Get a page from the pager and initialize it.  This routine
  1106 ** is just a convenience wrapper around separate calls to
  1107 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
  1108 */
  1109 static int getAndInitPage(
  1110   BtShared *pBt,          /* The database file */
  1111   Pgno pgno,           /* Number of the page to get */
  1112   MemPage **ppPage     /* Write the page pointer here */
  1113 ){
  1114   int rc;
  1115   DbPage *pDbPage;
  1116   MemPage *pPage;
  1117 
  1118   assert( sqlite3_mutex_held(pBt->mutex) );
  1119   if( pgno==0 ){
  1120     return SQLITE_CORRUPT_BKPT; 
  1121   }
  1122 
  1123   /* It is often the case that the page we want is already in cache.
  1124   ** If so, get it directly.  This saves us from having to call
  1125   ** pagerPagecount() to make sure pgno is within limits, which results
  1126   ** in a measureable performance improvements.
  1127   */
  1128   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
  1129   if( pDbPage ){
  1130     /* Page is already in cache */
  1131     *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1132     rc = SQLITE_OK;
  1133   }else{
  1134     /* Page not in cache.  Acquire it. */
  1135     if( pgno>pagerPagecount(pBt->pPager) ){
  1136       return SQLITE_CORRUPT_BKPT; 
  1137     }
  1138     rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
  1139     if( rc ) return rc;
  1140     pPage = *ppPage;
  1141   }
  1142   if( !pPage->isInit ){
  1143     rc = sqlite3BtreeInitPage(pPage);
  1144   }
  1145   if( rc!=SQLITE_OK ){
  1146     releasePage(pPage);
  1147     *ppPage = 0;
  1148   }
  1149   return rc;
  1150 }
  1151 
  1152 /*
  1153 ** Release a MemPage.  This should be called once for each prior
  1154 ** call to sqlite3BtreeGetPage.
  1155 */
  1156 static void releasePage(MemPage *pPage){
  1157   if( pPage ){
  1158     assert( pPage->aData );
  1159     assert( pPage->pBt );
  1160     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1161     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  1162     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1163     sqlite3PagerUnref(pPage->pDbPage);
  1164   }
  1165 }
  1166 
  1167 /*
  1168 ** During a rollback, when the pager reloads information into the cache
  1169 ** so that the cache is restored to its original state at the start of
  1170 ** the transaction, for each page restored this routine is called.
  1171 **
  1172 ** This routine needs to reset the extra data section at the end of the
  1173 ** page to agree with the restored data.
  1174 */
  1175 static void pageReinit(DbPage *pData){
  1176   MemPage *pPage;
  1177   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
  1178   if( pPage->isInit ){
  1179     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1180     pPage->isInit = 0;
  1181     if( sqlite3PagerPageRefcount(pData)>0 ){
  1182       sqlite3BtreeInitPage(pPage);
  1183     }
  1184   }
  1185 }
  1186 
  1187 /*
  1188 ** Invoke the busy handler for a btree.
  1189 */
  1190 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
  1191   BtShared *pBt = (BtShared*)pArg;
  1192   assert( pBt->db );
  1193   assert( sqlite3_mutex_held(pBt->db->mutex) );
  1194   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
  1195 }
  1196 
  1197 /*
  1198 ** Open a database file.
  1199 ** 
  1200 ** zFilename is the name of the database file.  If zFilename is NULL
  1201 ** a new database with a random name is created.  This randomly named
  1202 ** database file will be deleted when sqlite3BtreeClose() is called.
  1203 ** If zFilename is ":memory:" then an in-memory database is created
  1204 ** that is automatically destroyed when it is closed.
  1205 */
  1206 int sqlite3BtreeOpen(
  1207   const char *zFilename,  /* Name of the file containing the BTree database */
  1208   sqlite3 *db,            /* Associated database handle */
  1209   Btree **ppBtree,        /* Pointer to new Btree object written here */
  1210   int flags,              /* Options */
  1211   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
  1212 ){
  1213   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
  1214   BtShared *pBt = 0;      /* Shared part of btree structure */
  1215   Btree *p;               /* Handle to return */
  1216   int rc = SQLITE_OK;
  1217   int nReserve;
  1218   unsigned char zDbHeader[100];
  1219 
  1220   /* Set the variable isMemdb to true for an in-memory database, or 
  1221   ** false for a file-based database. This symbol is only required if
  1222   ** either of the shared-data or autovacuum features are compiled 
  1223   ** into the library.
  1224   */
  1225 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
  1226   #ifdef SQLITE_OMIT_MEMORYDB
  1227     const int isMemdb = 0;
  1228   #else
  1229     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
  1230   #endif
  1231 #endif
  1232 
  1233   assert( db!=0 );
  1234   assert( sqlite3_mutex_held(db->mutex) );
  1235 
  1236   pVfs = db->pVfs;
  1237   p = sqlite3MallocZero(sizeof(Btree));
  1238   if( !p ){
  1239     return SQLITE_NOMEM;
  1240   }
  1241   p->inTrans = TRANS_NONE;
  1242   p->db = db;
  1243 
  1244 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1245   /*
  1246   ** If this Btree is a candidate for shared cache, try to find an
  1247   ** existing BtShared object that we can share with
  1248   */
  1249   if( isMemdb==0
  1250    && (db->flags & SQLITE_Vtab)==0
  1251    && zFilename && zFilename[0]
  1252   ){
  1253     if( sqlite3GlobalConfig.sharedCacheEnabled ){
  1254       int nFullPathname = pVfs->mxPathname+1;
  1255       char *zFullPathname = sqlite3Malloc(nFullPathname);
  1256       sqlite3_mutex *mutexShared;
  1257       p->sharable = 1;
  1258       db->flags |= SQLITE_SharedCache;
  1259       if( !zFullPathname ){
  1260         sqlite3_free(p);
  1261         return SQLITE_NOMEM;
  1262       }
  1263       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
  1264       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1265       sqlite3_mutex_enter(mutexShared);
  1266       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
  1267         assert( pBt->nRef>0 );
  1268         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
  1269                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
  1270           p->pBt = pBt;
  1271           pBt->nRef++;
  1272           break;
  1273         }
  1274       }
  1275       sqlite3_mutex_leave(mutexShared);
  1276       sqlite3_free(zFullPathname);
  1277     }
  1278 #ifdef SQLITE_DEBUG
  1279     else{
  1280       /* In debug mode, we mark all persistent databases as sharable
  1281       ** even when they are not.  This exercises the locking code and
  1282       ** gives more opportunity for asserts(sqlite3_mutex_held())
  1283       ** statements to find locking problems.
  1284       */
  1285       p->sharable = 1;
  1286     }
  1287 #endif
  1288   }
  1289 #endif
  1290   if( pBt==0 ){
  1291     /*
  1292     ** The following asserts make sure that structures used by the btree are
  1293     ** the right size.  This is to guard against size changes that result
  1294     ** when compiling on a different architecture.
  1295     */
  1296     assert( sizeof(i64)==8 || sizeof(i64)==4 );
  1297     assert( sizeof(u64)==8 || sizeof(u64)==4 );
  1298     assert( sizeof(u32)==4 );
  1299     assert( sizeof(u16)==2 );
  1300     assert( sizeof(Pgno)==4 );
  1301   
  1302     pBt = sqlite3MallocZero( sizeof(*pBt) );
  1303     if( pBt==0 ){
  1304       rc = SQLITE_NOMEM;
  1305       goto btree_open_out;
  1306     }
  1307     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
  1308     pBt->busyHdr.pArg = pBt;
  1309     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
  1310                           EXTRA_SIZE, flags, vfsFlags);
  1311     if( rc==SQLITE_OK ){
  1312       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
  1313     }
  1314     if( rc!=SQLITE_OK ){
  1315       goto btree_open_out;
  1316     }
  1317     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
  1318     p->pBt = pBt;
  1319   
  1320     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
  1321     pBt->pCursor = 0;
  1322     pBt->pPage1 = 0;
  1323     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
  1324     pBt->pageSize = get2byte(&zDbHeader[16]);
  1325     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
  1326          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
  1327       pBt->pageSize = 0;
  1328       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1329 #ifndef SQLITE_OMIT_AUTOVACUUM
  1330       /* If the magic name ":memory:" will create an in-memory database, then
  1331       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
  1332       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
  1333       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
  1334       ** regular file-name. In this case the auto-vacuum applies as per normal.
  1335       */
  1336       if( zFilename && !isMemdb ){
  1337         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
  1338         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
  1339       }
  1340 #endif
  1341       nReserve = 0;
  1342     }else{
  1343       nReserve = zDbHeader[20];
  1344       pBt->pageSizeFixed = 1;
  1345 #ifndef SQLITE_OMIT_AUTOVACUUM
  1346       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
  1347       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
  1348 #endif
  1349     }
  1350     pBt->usableSize = pBt->pageSize - nReserve;
  1351     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
  1352     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1353    
  1354 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1355     /* Add the new BtShared object to the linked list sharable BtShareds.
  1356     */
  1357     if( p->sharable ){
  1358       sqlite3_mutex *mutexShared;
  1359       pBt->nRef = 1;
  1360       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1361       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
  1362         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
  1363         if( pBt->mutex==0 ){
  1364           rc = SQLITE_NOMEM;
  1365           db->mallocFailed = 0;
  1366           goto btree_open_out;
  1367         }
  1368       }
  1369       sqlite3_mutex_enter(mutexShared);
  1370       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1371       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
  1372       sqlite3_mutex_leave(mutexShared);
  1373     }
  1374 #endif
  1375   }
  1376 
  1377 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1378   /* If the new Btree uses a sharable pBtShared, then link the new
  1379   ** Btree into the list of all sharable Btrees for the same connection.
  1380   ** The list is kept in ascending order by pBt address.
  1381   */
  1382   if( p->sharable ){
  1383     int i;
  1384     Btree *pSib;
  1385     for(i=0; i<db->nDb; i++){
  1386       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
  1387         while( pSib->pPrev ){ pSib = pSib->pPrev; }
  1388         if( p->pBt<pSib->pBt ){
  1389           p->pNext = pSib;
  1390           p->pPrev = 0;
  1391           pSib->pPrev = p;
  1392         }else{
  1393           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
  1394             pSib = pSib->pNext;
  1395           }
  1396           p->pNext = pSib->pNext;
  1397           p->pPrev = pSib;
  1398           if( p->pNext ){
  1399             p->pNext->pPrev = p;
  1400           }
  1401           pSib->pNext = p;
  1402         }
  1403         break;
  1404       }
  1405     }
  1406   }
  1407 #endif
  1408   *ppBtree = p;
  1409 
  1410 btree_open_out:
  1411   if( rc!=SQLITE_OK ){
  1412     if( pBt && pBt->pPager ){
  1413       sqlite3PagerClose(pBt->pPager);
  1414     }
  1415     sqlite3_free(pBt);
  1416     sqlite3_free(p);
  1417     *ppBtree = 0;
  1418   }
  1419   return rc;
  1420 }
  1421 
  1422 /*
  1423 ** Decrement the BtShared.nRef counter.  When it reaches zero,
  1424 ** remove the BtShared structure from the sharing list.  Return
  1425 ** true if the BtShared.nRef counter reaches zero and return
  1426 ** false if it is still positive.
  1427 */
  1428 static int removeFromSharingList(BtShared *pBt){
  1429 #ifndef SQLITE_OMIT_SHARED_CACHE
  1430   sqlite3_mutex *pMaster;
  1431   BtShared *pList;
  1432   int removed = 0;
  1433 
  1434   assert( sqlite3_mutex_notheld(pBt->mutex) );
  1435   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1436   sqlite3_mutex_enter(pMaster);
  1437   pBt->nRef--;
  1438   if( pBt->nRef<=0 ){
  1439     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
  1440       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
  1441     }else{
  1442       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1443       while( ALWAYS(pList) && pList->pNext!=pBt ){
  1444         pList=pList->pNext;
  1445       }
  1446       if( ALWAYS(pList) ){
  1447         pList->pNext = pBt->pNext;
  1448       }
  1449     }
  1450     if( SQLITE_THREADSAFE ){
  1451       sqlite3_mutex_free(pBt->mutex);
  1452     }
  1453     removed = 1;
  1454   }
  1455   sqlite3_mutex_leave(pMaster);
  1456   return removed;
  1457 #else
  1458   return 1;
  1459 #endif
  1460 }
  1461 
  1462 /*
  1463 ** Make sure pBt->pTmpSpace points to an allocation of 
  1464 ** MX_CELL_SIZE(pBt) bytes.
  1465 */
  1466 static void allocateTempSpace(BtShared *pBt){
  1467   if( !pBt->pTmpSpace ){
  1468     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
  1469   }
  1470 }
  1471 
  1472 /*
  1473 ** Free the pBt->pTmpSpace allocation
  1474 */
  1475 static void freeTempSpace(BtShared *pBt){
  1476   sqlite3PageFree( pBt->pTmpSpace);
  1477   pBt->pTmpSpace = 0;
  1478 }
  1479 
  1480 /*
  1481 ** Close an open database and invalidate all cursors.
  1482 */
  1483 int sqlite3BtreeClose(Btree *p){
  1484   BtShared *pBt = p->pBt;
  1485   BtCursor *pCur;
  1486 
  1487   /* Close all cursors opened via this handle.  */
  1488   assert( sqlite3_mutex_held(p->db->mutex) );
  1489   sqlite3BtreeEnter(p);
  1490   pBt->db = p->db;
  1491   pCur = pBt->pCursor;
  1492   while( pCur ){
  1493     BtCursor *pTmp = pCur;
  1494     pCur = pCur->pNext;
  1495     if( pTmp->pBtree==p ){
  1496       sqlite3BtreeCloseCursor(pTmp);
  1497     }
  1498   }
  1499 
  1500   /* Rollback any active transaction and free the handle structure.
  1501   ** The call to sqlite3BtreeRollback() drops any table-locks held by
  1502   ** this handle.
  1503   */
  1504   sqlite3BtreeRollback(p);
  1505   sqlite3BtreeLeave(p);
  1506 
  1507   /* If there are still other outstanding references to the shared-btree
  1508   ** structure, return now. The remainder of this procedure cleans 
  1509   ** up the shared-btree.
  1510   */
  1511   assert( p->wantToLock==0 && p->locked==0 );
  1512   if( !p->sharable || removeFromSharingList(pBt) ){
  1513     /* The pBt is no longer on the sharing list, so we can access
  1514     ** it without having to hold the mutex.
  1515     **
  1516     ** Clean out and delete the BtShared object.
  1517     */
  1518     assert( !pBt->pCursor );
  1519     sqlite3PagerClose(pBt->pPager);
  1520     if( pBt->xFreeSchema && pBt->pSchema ){
  1521       pBt->xFreeSchema(pBt->pSchema);
  1522     }
  1523     sqlite3_free(pBt->pSchema);
  1524     freeTempSpace(pBt);
  1525     sqlite3_free(pBt);
  1526   }
  1527 
  1528 #ifndef SQLITE_OMIT_SHARED_CACHE
  1529   assert( p->wantToLock==0 );
  1530   assert( p->locked==0 );
  1531   if( p->pPrev ) p->pPrev->pNext = p->pNext;
  1532   if( p->pNext ) p->pNext->pPrev = p->pPrev;
  1533 #endif
  1534 
  1535   sqlite3_free(p);
  1536   return SQLITE_OK;
  1537 }
  1538 
  1539 /*
  1540 ** Change the limit on the number of pages allowed in the cache.
  1541 **
  1542 ** The maximum number of cache pages is set to the absolute
  1543 ** value of mxPage.  If mxPage is negative, the pager will
  1544 ** operate asynchronously - it will not stop to do fsync()s
  1545 ** to insure data is written to the disk surface before
  1546 ** continuing.  Transactions still work if synchronous is off,
  1547 ** and the database cannot be corrupted if this program
  1548 ** crashes.  But if the operating system crashes or there is
  1549 ** an abrupt power failure when synchronous is off, the database
  1550 ** could be left in an inconsistent and unrecoverable state.
  1551 ** Synchronous is on by default so database corruption is not
  1552 ** normally a worry.
  1553 */
  1554 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
  1555   BtShared *pBt = p->pBt;
  1556   assert( sqlite3_mutex_held(p->db->mutex) );
  1557   sqlite3BtreeEnter(p);
  1558   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
  1559   sqlite3BtreeLeave(p);
  1560   return SQLITE_OK;
  1561 }
  1562 
  1563 /*
  1564 ** Change the way data is synced to disk in order to increase or decrease
  1565 ** how well the database resists damage due to OS crashes and power
  1566 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
  1567 ** there is a high probability of damage)  Level 2 is the default.  There
  1568 ** is a very low but non-zero probability of damage.  Level 3 reduces the
  1569 ** probability of damage to near zero but with a write performance reduction.
  1570 */
  1571 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  1572 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
  1573   BtShared *pBt = p->pBt;
  1574   assert( sqlite3_mutex_held(p->db->mutex) );
  1575   sqlite3BtreeEnter(p);
  1576   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
  1577   sqlite3BtreeLeave(p);
  1578   return SQLITE_OK;
  1579 }
  1580 #endif
  1581 
  1582 /*
  1583 ** Return TRUE if the given btree is set to safety level 1.  In other
  1584 ** words, return TRUE if no sync() occurs on the disk files.
  1585 */
  1586 int sqlite3BtreeSyncDisabled(Btree *p){
  1587   BtShared *pBt = p->pBt;
  1588   int rc;
  1589   assert( sqlite3_mutex_held(p->db->mutex) );  
  1590   sqlite3BtreeEnter(p);
  1591   assert( pBt && pBt->pPager );
  1592   rc = sqlite3PagerNosync(pBt->pPager);
  1593   sqlite3BtreeLeave(p);
  1594   return rc;
  1595 }
  1596 
  1597 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
  1598 /*
  1599 ** Change the default pages size and the number of reserved bytes per page.
  1600 **
  1601 ** The page size must be a power of 2 between 512 and 65536.  If the page
  1602 ** size supplied does not meet this constraint then the page size is not
  1603 ** changed.
  1604 **
  1605 ** Page sizes are constrained to be a power of two so that the region
  1606 ** of the database file used for locking (beginning at PENDING_BYTE,
  1607 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
  1608 ** at the beginning of a page.
  1609 **
  1610 ** If parameter nReserve is less than zero, then the number of reserved
  1611 ** bytes per page is left unchanged.
  1612 */
  1613 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
  1614   int rc = SQLITE_OK;
  1615   BtShared *pBt = p->pBt;
  1616   sqlite3BtreeEnter(p);
  1617   if( pBt->pageSizeFixed ){
  1618     sqlite3BtreeLeave(p);
  1619     return SQLITE_READONLY;
  1620   }
  1621   if( nReserve<0 ){
  1622     nReserve = pBt->pageSize - pBt->usableSize;
  1623   }
  1624   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
  1625         ((pageSize-1)&pageSize)==0 ){
  1626     assert( (pageSize & 7)==0 );
  1627     assert( !pBt->pPage1 && !pBt->pCursor );
  1628     pBt->pageSize = pageSize;
  1629     freeTempSpace(pBt);
  1630     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1631   }
  1632   pBt->usableSize = pBt->pageSize - nReserve;
  1633   sqlite3BtreeLeave(p);
  1634   return rc;
  1635 }
  1636 
  1637 /*
  1638 ** Return the currently defined page size
  1639 */
  1640 int sqlite3BtreeGetPageSize(Btree *p){
  1641   return p->pBt->pageSize;
  1642 }
  1643 int sqlite3BtreeGetReserve(Btree *p){
  1644   int n;
  1645   sqlite3BtreeEnter(p);
  1646   n = p->pBt->pageSize - p->pBt->usableSize;
  1647   sqlite3BtreeLeave(p);
  1648   return n;
  1649 }
  1650 
  1651 /*
  1652 ** Set the maximum page count for a database if mxPage is positive.
  1653 ** No changes are made if mxPage is 0 or negative.
  1654 ** Regardless of the value of mxPage, return the maximum page count.
  1655 */
  1656 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
  1657   int n;
  1658   sqlite3BtreeEnter(p);
  1659   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
  1660   sqlite3BtreeLeave(p);
  1661   return n;
  1662 }
  1663 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
  1664 
  1665 /*
  1666 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
  1667 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
  1668 ** is disabled. The default value for the auto-vacuum property is 
  1669 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
  1670 */
  1671 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
  1672 #ifdef SQLITE_OMIT_AUTOVACUUM
  1673   return SQLITE_READONLY;
  1674 #else
  1675   BtShared *pBt = p->pBt;
  1676   int rc = SQLITE_OK;
  1677   int av = (autoVacuum?1:0);
  1678 
  1679   sqlite3BtreeEnter(p);
  1680   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
  1681     rc = SQLITE_READONLY;
  1682   }else{
  1683     pBt->autoVacuum = av;
  1684   }
  1685   sqlite3BtreeLeave(p);
  1686   return rc;
  1687 #endif
  1688 }
  1689 
  1690 /*
  1691 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
  1692 ** enabled 1 is returned. Otherwise 0.
  1693 */
  1694 int sqlite3BtreeGetAutoVacuum(Btree *p){
  1695 #ifdef SQLITE_OMIT_AUTOVACUUM
  1696   return BTREE_AUTOVACUUM_NONE;
  1697 #else
  1698   int rc;
  1699   sqlite3BtreeEnter(p);
  1700   rc = (
  1701     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
  1702     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
  1703     BTREE_AUTOVACUUM_INCR
  1704   );
  1705   sqlite3BtreeLeave(p);
  1706   return rc;
  1707 #endif
  1708 }
  1709 
  1710 
  1711 /*
  1712 ** Get a reference to pPage1 of the database file.  This will
  1713 ** also acquire a readlock on that file.
  1714 **
  1715 ** SQLITE_OK is returned on success.  If the file is not a
  1716 ** well-formed database file, then SQLITE_CORRUPT is returned.
  1717 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
  1718 ** is returned if we run out of memory. 
  1719 */
  1720 static int lockBtree(BtShared *pBt){
  1721   int rc;
  1722   MemPage *pPage1;
  1723   int nPage;
  1724 
  1725   assert( sqlite3_mutex_held(pBt->mutex) );
  1726   if( pBt->pPage1 ) return SQLITE_OK;
  1727   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
  1728   if( rc!=SQLITE_OK ) return rc;
  1729 
  1730   /* Do some checking to help insure the file we opened really is
  1731   ** a valid database file. 
  1732   */
  1733   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
  1734   if( rc!=SQLITE_OK ){
  1735     goto page1_init_failed;
  1736   }else if( nPage>0 ){
  1737     int pageSize;
  1738     int usableSize;
  1739     u8 *page1 = pPage1->aData;
  1740     rc = SQLITE_NOTADB;
  1741     if( memcmp(page1, zMagicHeader, 16)!=0 ){
  1742       goto page1_init_failed;
  1743     }
  1744     if( page1[18]>1 ){
  1745       pBt->readOnly = 1;
  1746     }
  1747     if( page1[19]>1 ){
  1748       goto page1_init_failed;
  1749     }
  1750 
  1751     /* The maximum embedded fraction must be exactly 25%.  And the minimum
  1752     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
  1753     ** The original design allowed these amounts to vary, but as of
  1754     ** version 3.6.0, we require them to be fixed.
  1755     */
  1756     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
  1757       goto page1_init_failed;
  1758     }
  1759     pageSize = get2byte(&page1[16]);
  1760     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
  1761         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
  1762     ){
  1763       goto page1_init_failed;
  1764     }
  1765     assert( (pageSize & 7)==0 );
  1766     usableSize = pageSize - page1[20];
  1767     if( pageSize!=pBt->pageSize ){
  1768       /* After reading the first page of the database assuming a page size
  1769       ** of BtShared.pageSize, we have discovered that the page-size is
  1770       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
  1771       ** zero and return SQLITE_OK. The caller will call this function
  1772       ** again with the correct page-size.
  1773       */
  1774       releasePage(pPage1);
  1775       pBt->usableSize = usableSize;
  1776       pBt->pageSize = pageSize;
  1777       freeTempSpace(pBt);
  1778       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1779       return SQLITE_OK;
  1780     }
  1781     if( usableSize<500 ){
  1782       goto page1_init_failed;
  1783     }
  1784     pBt->pageSize = pageSize;
  1785     pBt->usableSize = usableSize;
  1786 #ifndef SQLITE_OMIT_AUTOVACUUM
  1787     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
  1788     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
  1789 #endif
  1790   }
  1791 
  1792   /* maxLocal is the maximum amount of payload to store locally for
  1793   ** a cell.  Make sure it is small enough so that at least minFanout
  1794   ** cells can will fit on one page.  We assume a 10-byte page header.
  1795   ** Besides the payload, the cell must store:
  1796   **     2-byte pointer to the cell
  1797   **     4-byte child pointer
  1798   **     9-byte nKey value
  1799   **     4-byte nData value
  1800   **     4-byte overflow page pointer
  1801   ** So a cell consists of a 2-byte poiner, a header which is as much as
  1802   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  1803   ** page pointer.
  1804   */
  1805   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
  1806   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
  1807   pBt->maxLeaf = pBt->usableSize - 35;
  1808   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
  1809   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  1810   pBt->pPage1 = pPage1;
  1811   return SQLITE_OK;
  1812 
  1813 page1_init_failed:
  1814   releasePage(pPage1);
  1815   pBt->pPage1 = 0;
  1816   return rc;
  1817 }
  1818 
  1819 /*
  1820 ** This routine works like lockBtree() except that it also invokes the
  1821 ** busy callback if there is lock contention.
  1822 */
  1823 static int lockBtreeWithRetry(Btree *pRef){
  1824   int rc = SQLITE_OK;
  1825 
  1826   assert( sqlite3BtreeHoldsMutex(pRef) );
  1827   if( pRef->inTrans==TRANS_NONE ){
  1828     u8 inTransaction = pRef->pBt->inTransaction;
  1829     btreeIntegrity(pRef);
  1830     rc = sqlite3BtreeBeginTrans(pRef, 0);
  1831     pRef->pBt->inTransaction = inTransaction;
  1832     pRef->inTrans = TRANS_NONE;
  1833     if( rc==SQLITE_OK ){
  1834       pRef->pBt->nTransaction--;
  1835     }
  1836     btreeIntegrity(pRef);
  1837   }
  1838   return rc;
  1839 }
  1840        
  1841 
  1842 /*
  1843 ** If there are no outstanding cursors and we are not in the middle
  1844 ** of a transaction but there is a read lock on the database, then
  1845 ** this routine unrefs the first page of the database file which 
  1846 ** has the effect of releasing the read lock.
  1847 **
  1848 ** If there are any outstanding cursors, this routine is a no-op.
  1849 **
  1850 ** If there is a transaction in progress, this routine is a no-op.
  1851 */
  1852 static void unlockBtreeIfUnused(BtShared *pBt){
  1853   assert( sqlite3_mutex_held(pBt->mutex) );
  1854   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
  1855     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
  1856       assert( pBt->pPage1->aData );
  1857 #if 0
  1858       if( pBt->pPage1->aData==0 ){
  1859         MemPage *pPage = pBt->pPage1;
  1860         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
  1861         pPage->pBt = pBt;
  1862         pPage->pgno = 1;
  1863       }
  1864 #endif
  1865       releasePage(pBt->pPage1);
  1866     }
  1867     pBt->pPage1 = 0;
  1868     pBt->inStmt = 0;
  1869   }
  1870 }
  1871 
  1872 /*
  1873 ** Create a new database by initializing the first page of the
  1874 ** file.
  1875 */
  1876 static int newDatabase(BtShared *pBt){
  1877   MemPage *pP1;
  1878   unsigned char *data;
  1879   int rc;
  1880   int nPage;
  1881 
  1882   assert( sqlite3_mutex_held(pBt->mutex) );
  1883   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
  1884   if( rc!=SQLITE_OK || nPage>0 ){
  1885     return rc;
  1886   }
  1887   pP1 = pBt->pPage1;
  1888   assert( pP1!=0 );
  1889   data = pP1->aData;
  1890   rc = sqlite3PagerWrite(pP1->pDbPage);
  1891   if( rc ) return rc;
  1892   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
  1893   assert( sizeof(zMagicHeader)==16 );
  1894   put2byte(&data[16], pBt->pageSize);
  1895   data[18] = 1;
  1896   data[19] = 1;
  1897   data[20] = pBt->pageSize - pBt->usableSize;
  1898   data[21] = 64;
  1899   data[22] = 32;
  1900   data[23] = 32;
  1901   memset(&data[24], 0, 100-24);
  1902   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
  1903   pBt->pageSizeFixed = 1;
  1904 #ifndef SQLITE_OMIT_AUTOVACUUM
  1905   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
  1906   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
  1907   put4byte(&data[36 + 4*4], pBt->autoVacuum);
  1908   put4byte(&data[36 + 7*4], pBt->incrVacuum);
  1909 #endif
  1910   return SQLITE_OK;
  1911 }
  1912 
  1913 /*
  1914 ** Attempt to start a new transaction. A write-transaction
  1915 ** is started if the second argument is nonzero, otherwise a read-
  1916 ** transaction.  If the second argument is 2 or more and exclusive
  1917 ** transaction is started, meaning that no other process is allowed
  1918 ** to access the database.  A preexisting transaction may not be
  1919 ** upgraded to exclusive by calling this routine a second time - the
  1920 ** exclusivity flag only works for a new transaction.
  1921 **
  1922 ** A write-transaction must be started before attempting any 
  1923 ** changes to the database.  None of the following routines 
  1924 ** will work unless a transaction is started first:
  1925 **
  1926 **      sqlite3BtreeCreateTable()
  1927 **      sqlite3BtreeCreateIndex()
  1928 **      sqlite3BtreeClearTable()
  1929 **      sqlite3BtreeDropTable()
  1930 **      sqlite3BtreeInsert()
  1931 **      sqlite3BtreeDelete()
  1932 **      sqlite3BtreeUpdateMeta()
  1933 **
  1934 ** If an initial attempt to acquire the lock fails because of lock contention
  1935 ** and the database was previously unlocked, then invoke the busy handler
  1936 ** if there is one.  But if there was previously a read-lock, do not
  1937 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
  1938 ** returned when there is already a read-lock in order to avoid a deadlock.
  1939 **
  1940 ** Suppose there are two processes A and B.  A has a read lock and B has
  1941 ** a reserved lock.  B tries to promote to exclusive but is blocked because
  1942 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
  1943 ** One or the other of the two processes must give way or there can be
  1944 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
  1945 ** when A already has a read lock, we encourage A to give up and let B
  1946 ** proceed.
  1947 */
  1948 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  1949   BtShared *pBt = p->pBt;
  1950   int rc = SQLITE_OK;
  1951 
  1952   sqlite3BtreeEnter(p);
  1953   pBt->db = p->db;
  1954   btreeIntegrity(p);
  1955 
  1956   /* If the btree is already in a write-transaction, or it
  1957   ** is already in a read-transaction and a read-transaction
  1958   ** is requested, this is a no-op.
  1959   */
  1960   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
  1961     goto trans_begun;
  1962   }
  1963 
  1964   /* Write transactions are not possible on a read-only database */
  1965   if( pBt->readOnly && wrflag ){
  1966     rc = SQLITE_READONLY;
  1967     goto trans_begun;
  1968   }
  1969 
  1970   /* If another database handle has already opened a write transaction 
  1971   ** on this shared-btree structure and a second write transaction is
  1972   ** requested, return SQLITE_BUSY.
  1973   */
  1974   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
  1975     rc = SQLITE_BUSY;
  1976     goto trans_begun;
  1977   }
  1978 
  1979 #ifndef SQLITE_OMIT_SHARED_CACHE
  1980   if( wrflag>1 ){
  1981     BtLock *pIter;
  1982     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  1983       if( pIter->pBtree!=p ){
  1984         rc = SQLITE_BUSY;
  1985         goto trans_begun;
  1986       }
  1987     }
  1988   }
  1989 #endif
  1990 
  1991   do {
  1992     if( pBt->pPage1==0 ){
  1993       do{
  1994         rc = lockBtree(pBt);
  1995       }while( pBt->pPage1==0 && rc==SQLITE_OK );
  1996     }
  1997 
  1998     if( rc==SQLITE_OK && wrflag ){
  1999       if( pBt->readOnly ){
  2000         rc = SQLITE_READONLY;
  2001       }else{
  2002         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
  2003         if( rc==SQLITE_OK ){
  2004           rc = newDatabase(pBt);
  2005         }
  2006       }
  2007     }
  2008   
  2009     if( rc==SQLITE_OK ){
  2010       if( wrflag ) pBt->inStmt = 0;
  2011     }else{
  2012       unlockBtreeIfUnused(pBt);
  2013     }
  2014   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
  2015           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
  2016 
  2017   if( rc==SQLITE_OK ){
  2018     if( p->inTrans==TRANS_NONE ){
  2019       pBt->nTransaction++;
  2020     }
  2021     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
  2022     if( p->inTrans>pBt->inTransaction ){
  2023       pBt->inTransaction = p->inTrans;
  2024     }
  2025 #ifndef SQLITE_OMIT_SHARED_CACHE
  2026     if( wrflag>1 ){
  2027       assert( !pBt->pExclusive );
  2028       pBt->pExclusive = p;
  2029     }
  2030 #endif
  2031   }
  2032 
  2033 
  2034 trans_begun:
  2035   btreeIntegrity(p);
  2036   sqlite3BtreeLeave(p);
  2037   return rc;
  2038 }
  2039 
  2040 
  2041 #ifndef SQLITE_OMIT_AUTOVACUUM
  2042 
  2043 /*
  2044 ** Set the pointer-map entries for all children of page pPage. Also, if
  2045 ** pPage contains cells that point to overflow pages, set the pointer
  2046 ** map entries for the overflow pages as well.
  2047 */
  2048 static int setChildPtrmaps(MemPage *pPage){
  2049   int i;                             /* Counter variable */
  2050   int nCell;                         /* Number of cells in page pPage */
  2051   int rc;                            /* Return code */
  2052   BtShared *pBt = pPage->pBt;
  2053   int isInitOrig = pPage->isInit;
  2054   Pgno pgno = pPage->pgno;
  2055 
  2056   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2057   rc = sqlite3BtreeInitPage(pPage);
  2058   if( rc!=SQLITE_OK ){
  2059     goto set_child_ptrmaps_out;
  2060   }
  2061   nCell = pPage->nCell;
  2062 
  2063   for(i=0; i<nCell; i++){
  2064     u8 *pCell = findCell(pPage, i);
  2065 
  2066     rc = ptrmapPutOvflPtr(pPage, pCell);
  2067     if( rc!=SQLITE_OK ){
  2068       goto set_child_ptrmaps_out;
  2069     }
  2070 
  2071     if( !pPage->leaf ){
  2072       Pgno childPgno = get4byte(pCell);
  2073       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
  2074       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
  2075     }
  2076   }
  2077 
  2078   if( !pPage->leaf ){
  2079     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  2080     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
  2081   }
  2082 
  2083 set_child_ptrmaps_out:
  2084   pPage->isInit = isInitOrig;
  2085   return rc;
  2086 }
  2087 
  2088 /*
  2089 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
  2090 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
  2091 ** iTo. Parameter eType describes the type of pointer to be modified, as 
  2092 ** follows:
  2093 **
  2094 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
  2095 **                   page of pPage.
  2096 **
  2097 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
  2098 **                   page pointed to by one of the cells on pPage.
  2099 **
  2100 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
  2101 **                   overflow page in the list.
  2102 */
  2103 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
  2104   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2105   if( eType==PTRMAP_OVERFLOW2 ){
  2106     /* The pointer is always the first 4 bytes of the page in this case.  */
  2107     if( get4byte(pPage->aData)!=iFrom ){
  2108       return SQLITE_CORRUPT_BKPT;
  2109     }
  2110     put4byte(pPage->aData, iTo);
  2111   }else{
  2112     int isInitOrig = pPage->isInit;
  2113     int i;
  2114     int nCell;
  2115 
  2116     sqlite3BtreeInitPage(pPage);
  2117     nCell = pPage->nCell;
  2118 
  2119     for(i=0; i<nCell; i++){
  2120       u8 *pCell = findCell(pPage, i);
  2121       if( eType==PTRMAP_OVERFLOW1 ){
  2122         CellInfo info;
  2123         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  2124         if( info.iOverflow ){
  2125           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
  2126             put4byte(&pCell[info.iOverflow], iTo);
  2127             break;
  2128           }
  2129         }
  2130       }else{
  2131         if( get4byte(pCell)==iFrom ){
  2132           put4byte(pCell, iTo);
  2133           break;
  2134         }
  2135       }
  2136     }
  2137   
  2138     if( i==nCell ){
  2139       if( eType!=PTRMAP_BTREE || 
  2140           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
  2141         return SQLITE_CORRUPT_BKPT;
  2142       }
  2143       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
  2144     }
  2145 
  2146     pPage->isInit = isInitOrig;
  2147   }
  2148   return SQLITE_OK;
  2149 }
  2150 
  2151 
  2152 /*
  2153 ** Move the open database page pDbPage to location iFreePage in the 
  2154 ** database. The pDbPage reference remains valid.
  2155 */
  2156 static int relocatePage(
  2157   BtShared *pBt,           /* Btree */
  2158   MemPage *pDbPage,        /* Open page to move */
  2159   u8 eType,                /* Pointer map 'type' entry for pDbPage */
  2160   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
  2161   Pgno iFreePage,          /* The location to move pDbPage to */
  2162   int isCommit
  2163 ){
  2164   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
  2165   Pgno iDbPage = pDbPage->pgno;
  2166   Pager *pPager = pBt->pPager;
  2167   int rc;
  2168 
  2169   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
  2170       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
  2171   assert( sqlite3_mutex_held(pBt->mutex) );
  2172   assert( pDbPage->pBt==pBt );
  2173 
  2174   /* Move page iDbPage from its current location to page number iFreePage */
  2175   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
  2176       iDbPage, iFreePage, iPtrPage, eType));
  2177   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
  2178   if( rc!=SQLITE_OK ){
  2179     return rc;
  2180   }
  2181   pDbPage->pgno = iFreePage;
  2182 
  2183   /* If pDbPage was a btree-page, then it may have child pages and/or cells
  2184   ** that point to overflow pages. The pointer map entries for all these
  2185   ** pages need to be changed.
  2186   **
  2187   ** If pDbPage is an overflow page, then the first 4 bytes may store a
  2188   ** pointer to a subsequent overflow page. If this is the case, then
  2189   ** the pointer map needs to be updated for the subsequent overflow page.
  2190   */
  2191   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
  2192     rc = setChildPtrmaps(pDbPage);
  2193     if( rc!=SQLITE_OK ){
  2194       return rc;
  2195     }
  2196   }else{
  2197     Pgno nextOvfl = get4byte(pDbPage->aData);
  2198     if( nextOvfl!=0 ){
  2199       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
  2200       if( rc!=SQLITE_OK ){
  2201         return rc;
  2202       }
  2203     }
  2204   }
  2205 
  2206   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
  2207   ** that it points at iFreePage. Also fix the pointer map entry for
  2208   ** iPtrPage.
  2209   */
  2210   if( eType!=PTRMAP_ROOTPAGE ){
  2211     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
  2212     if( rc!=SQLITE_OK ){
  2213       return rc;
  2214     }
  2215     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
  2216     if( rc!=SQLITE_OK ){
  2217       releasePage(pPtrPage);
  2218       return rc;
  2219     }
  2220     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
  2221     releasePage(pPtrPage);
  2222     if( rc==SQLITE_OK ){
  2223       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
  2224     }
  2225   }
  2226   return rc;
  2227 }
  2228 
  2229 /* Forward declaration required by incrVacuumStep(). */
  2230 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
  2231 
  2232 /*
  2233 ** Perform a single step of an incremental-vacuum. If successful,
  2234 ** return SQLITE_OK. If there is no work to do (and therefore no
  2235 ** point in calling this function again), return SQLITE_DONE.
  2236 **
  2237 ** More specificly, this function attempts to re-organize the 
  2238 ** database so that the last page of the file currently in use
  2239 ** is no longer in use.
  2240 **
  2241 ** If the nFin parameter is non-zero, the implementation assumes
  2242 ** that the caller will keep calling incrVacuumStep() until
  2243 ** it returns SQLITE_DONE or an error, and that nFin is the
  2244 ** number of pages the database file will contain after this 
  2245 ** process is complete.
  2246 */
  2247 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
  2248   Pgno iLastPg;             /* Last page in the database */
  2249   Pgno nFreeList;           /* Number of pages still on the free-list */
  2250 
  2251   assert( sqlite3_mutex_held(pBt->mutex) );
  2252   iLastPg = pBt->nTrunc;
  2253   if( iLastPg==0 ){
  2254     iLastPg = pagerPagecount(pBt->pPager);
  2255   }
  2256 
  2257   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
  2258     int rc;
  2259     u8 eType;
  2260     Pgno iPtrPage;
  2261 
  2262     nFreeList = get4byte(&pBt->pPage1->aData[36]);
  2263     if( nFreeList==0 || nFin==iLastPg ){
  2264       return SQLITE_DONE;
  2265     }
  2266 
  2267     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
  2268     if( rc!=SQLITE_OK ){
  2269       return rc;
  2270     }
  2271     if( eType==PTRMAP_ROOTPAGE ){
  2272       return SQLITE_CORRUPT_BKPT;
  2273     }
  2274 
  2275     if( eType==PTRMAP_FREEPAGE ){
  2276       if( nFin==0 ){
  2277         /* Remove the page from the files free-list. This is not required
  2278         ** if nFin is non-zero. In that case, the free-list will be
  2279         ** truncated to zero after this function returns, so it doesn't 
  2280         ** matter if it still contains some garbage entries.
  2281         */
  2282         Pgno iFreePg;
  2283         MemPage *pFreePg;
  2284         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
  2285         if( rc!=SQLITE_OK ){
  2286           return rc;
  2287         }
  2288         assert( iFreePg==iLastPg );
  2289         releasePage(pFreePg);
  2290       }
  2291     } else {
  2292       Pgno iFreePg;             /* Index of free page to move pLastPg to */
  2293       MemPage *pLastPg;
  2294 
  2295       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
  2296       if( rc!=SQLITE_OK ){
  2297         return rc;
  2298       }
  2299 
  2300       /* If nFin is zero, this loop runs exactly once and page pLastPg
  2301       ** is swapped with the first free page pulled off the free list.
  2302       **
  2303       ** On the other hand, if nFin is greater than zero, then keep
  2304       ** looping until a free-page located within the first nFin pages
  2305       ** of the file is found.
  2306       */
  2307       do {
  2308         MemPage *pFreePg;
  2309         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
  2310         if( rc!=SQLITE_OK ){
  2311           releasePage(pLastPg);
  2312           return rc;
  2313         }
  2314         releasePage(pFreePg);
  2315       }while( nFin!=0 && iFreePg>nFin );
  2316       assert( iFreePg<iLastPg );
  2317       
  2318       rc = sqlite3PagerWrite(pLastPg->pDbPage);
  2319       if( rc==SQLITE_OK ){
  2320         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
  2321       }
  2322       releasePage(pLastPg);
  2323       if( rc!=SQLITE_OK ){
  2324         return rc;
  2325       }
  2326     }
  2327   }
  2328 
  2329   pBt->nTrunc = iLastPg - 1;
  2330   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
  2331     pBt->nTrunc--;
  2332   }
  2333   return SQLITE_OK;
  2334 }
  2335 
  2336 /*
  2337 ** A write-transaction must be opened before calling this function.
  2338 ** It performs a single unit of work towards an incremental vacuum.
  2339 **
  2340 ** If the incremental vacuum is finished after this function has run,
  2341 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
  2342 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
  2343 */
  2344 int sqlite3BtreeIncrVacuum(Btree *p){
  2345   int rc;
  2346   BtShared *pBt = p->pBt;
  2347 
  2348   sqlite3BtreeEnter(p);
  2349   pBt->db = p->db;
  2350   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  2351   if( !pBt->autoVacuum ){
  2352     rc = SQLITE_DONE;
  2353   }else{
  2354     invalidateAllOverflowCache(pBt);
  2355     rc = incrVacuumStep(pBt, 0);
  2356   }
  2357   sqlite3BtreeLeave(p);
  2358   return rc;
  2359 }
  2360 
  2361 /*
  2362 ** This routine is called prior to sqlite3PagerCommit when a transaction
  2363 ** is commited for an auto-vacuum database.
  2364 **
  2365 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
  2366 ** the database file should be truncated to during the commit process. 
  2367 ** i.e. the database has been reorganized so that only the first *pnTrunc
  2368 ** pages are in use.
  2369 */
  2370 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
  2371   int rc = SQLITE_OK;
  2372   Pager *pPager = pBt->pPager;
  2373   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
  2374 
  2375   assert( sqlite3_mutex_held(pBt->mutex) );
  2376   invalidateAllOverflowCache(pBt);
  2377   assert(pBt->autoVacuum);
  2378   if( !pBt->incrVacuum ){
  2379     Pgno nFin = 0;
  2380 
  2381     if( pBt->nTrunc==0 ){
  2382       Pgno nFree;
  2383       Pgno nPtrmap;
  2384       const int pgsz = pBt->pageSize;
  2385       int nOrig = pagerPagecount(pBt->pPager);
  2386 
  2387       if( PTRMAP_ISPAGE(pBt, nOrig) ){
  2388         return SQLITE_CORRUPT_BKPT;
  2389       }
  2390       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
  2391         nOrig--;
  2392       }
  2393       nFree = get4byte(&pBt->pPage1->aData[36]);
  2394       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
  2395       nFin = nOrig - nFree - nPtrmap;
  2396       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
  2397         nFin--;
  2398       }
  2399       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
  2400         nFin--;
  2401       }
  2402     }
  2403 
  2404     while( rc==SQLITE_OK ){
  2405       rc = incrVacuumStep(pBt, nFin);
  2406     }
  2407     if( rc==SQLITE_DONE ){
  2408       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
  2409       rc = SQLITE_OK;
  2410       if( pBt->nTrunc && nFin ){
  2411         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  2412         put4byte(&pBt->pPage1->aData[32], 0);
  2413         put4byte(&pBt->pPage1->aData[36], 0);
  2414         pBt->nTrunc = nFin;
  2415       }
  2416     }
  2417     if( rc!=SQLITE_OK ){
  2418       sqlite3PagerRollback(pPager);
  2419     }
  2420   }
  2421 
  2422   if( rc==SQLITE_OK ){
  2423     *pnTrunc = pBt->nTrunc;
  2424     pBt->nTrunc = 0;
  2425   }
  2426   assert( nRef==sqlite3PagerRefcount(pPager) );
  2427   return rc;
  2428 }
  2429 
  2430 #endif
  2431 
  2432 /*
  2433 ** This routine does the first phase of a two-phase commit.  This routine
  2434 ** causes a rollback journal to be created (if it does not already exist)
  2435 ** and populated with enough information so that if a power loss occurs
  2436 ** the database can be restored to its original state by playing back
  2437 ** the journal.  Then the contents of the journal are flushed out to
  2438 ** the disk.  After the journal is safely on oxide, the changes to the
  2439 ** database are written into the database file and flushed to oxide.
  2440 ** At the end of this call, the rollback journal still exists on the
  2441 ** disk and we are still holding all locks, so the transaction has not
  2442 ** committed.  See sqlite3BtreeCommit() for the second phase of the
  2443 ** commit process.
  2444 **
  2445 ** This call is a no-op if no write-transaction is currently active on pBt.
  2446 **
  2447 ** Otherwise, sync the database file for the btree pBt. zMaster points to
  2448 ** the name of a master journal file that should be written into the
  2449 ** individual journal file, or is NULL, indicating no master journal file 
  2450 ** (single database transaction).
  2451 **
  2452 ** When this is called, the master journal should already have been
  2453 ** created, populated with this journal pointer and synced to disk.
  2454 **
  2455 ** Once this is routine has returned, the only thing required to commit
  2456 ** the write-transaction for this database file is to delete the journal.
  2457 */
  2458 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
  2459   int rc = SQLITE_OK;
  2460   if( p->inTrans==TRANS_WRITE ){
  2461     BtShared *pBt = p->pBt;
  2462     Pgno nTrunc = 0;
  2463     sqlite3BtreeEnter(p);
  2464     pBt->db = p->db;
  2465 #ifndef SQLITE_OMIT_AUTOVACUUM
  2466     if( pBt->autoVacuum ){
  2467       rc = autoVacuumCommit(pBt, &nTrunc); 
  2468       if( rc!=SQLITE_OK ){
  2469         sqlite3BtreeLeave(p);
  2470         return rc;
  2471       }
  2472     }
  2473 #endif
  2474     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
  2475     sqlite3BtreeLeave(p);
  2476   }
  2477   return rc;
  2478 }
  2479 
  2480 /*
  2481 ** Commit the transaction currently in progress.
  2482 **
  2483 ** This routine implements the second phase of a 2-phase commit.  The
  2484 ** sqlite3BtreeSync() routine does the first phase and should be invoked
  2485 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
  2486 ** all the work of writing information out to disk and flushing the
  2487 ** contents so that they are written onto the disk platter.  All this
  2488 ** routine has to do is delete or truncate the rollback journal
  2489 ** (which causes the transaction to commit) and drop locks.
  2490 **
  2491 ** This will release the write lock on the database file.  If there
  2492 ** are no active cursors, it also releases the read lock.
  2493 */
  2494 int sqlite3BtreeCommitPhaseTwo(Btree *p){
  2495   BtShared *pBt = p->pBt;
  2496 
  2497   sqlite3BtreeEnter(p);
  2498   pBt->db = p->db;
  2499   btreeIntegrity(p);
  2500 
  2501   /* If the handle has a write-transaction open, commit the shared-btrees 
  2502   ** transaction and set the shared state to TRANS_READ.
  2503   */
  2504   if( p->inTrans==TRANS_WRITE ){
  2505     int rc;
  2506     assert( pBt->inTransaction==TRANS_WRITE );
  2507     assert( pBt->nTransaction>0 );
  2508     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
  2509     if( rc!=SQLITE_OK ){
  2510       sqlite3BtreeLeave(p);
  2511       return rc;
  2512     }
  2513     pBt->inTransaction = TRANS_READ;
  2514     pBt->inStmt = 0;
  2515   }
  2516   unlockAllTables(p);
  2517 
  2518   /* If the handle has any kind of transaction open, decrement the transaction
  2519   ** count of the shared btree. If the transaction count reaches 0, set
  2520   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
  2521   ** will unlock the pager.
  2522   */
  2523   if( p->inTrans!=TRANS_NONE ){
  2524     pBt->nTransaction--;
  2525     if( 0==pBt->nTransaction ){
  2526       pBt->inTransaction = TRANS_NONE;
  2527     }
  2528   }
  2529 
  2530   /* Set the handles current transaction state to TRANS_NONE and unlock
  2531   ** the pager if this call closed the only read or write transaction.
  2532   */
  2533   p->inTrans = TRANS_NONE;
  2534   unlockBtreeIfUnused(pBt);
  2535 
  2536   btreeIntegrity(p);
  2537   sqlite3BtreeLeave(p);
  2538   return SQLITE_OK;
  2539 }
  2540 
  2541 /*
  2542 ** Do both phases of a commit.
  2543 */
  2544 int sqlite3BtreeCommit(Btree *p){
  2545   int rc;
  2546   sqlite3BtreeEnter(p);
  2547   rc = sqlite3BtreeCommitPhaseOne(p, 0);
  2548   if( rc==SQLITE_OK ){
  2549     rc = sqlite3BtreeCommitPhaseTwo(p);
  2550   }
  2551   sqlite3BtreeLeave(p);
  2552   return rc;
  2553 }
  2554 
  2555 #ifndef NDEBUG
  2556 /*
  2557 ** Return the number of write-cursors open on this handle. This is for use
  2558 ** in assert() expressions, so it is only compiled if NDEBUG is not
  2559 ** defined.
  2560 **
  2561 ** For the purposes of this routine, a write-cursor is any cursor that
  2562 ** is capable of writing to the databse.  That means the cursor was
  2563 ** originally opened for writing and the cursor has not be disabled
  2564 ** by having its state changed to CURSOR_FAULT.
  2565 */
  2566 static int countWriteCursors(BtShared *pBt){
  2567   BtCursor *pCur;
  2568   int r = 0;
  2569   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
  2570     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 
  2571   }
  2572   return r;
  2573 }
  2574 #endif
  2575 
  2576 /*
  2577 ** This routine sets the state to CURSOR_FAULT and the error
  2578 ** code to errCode for every cursor on BtShared that pBtree
  2579 ** references.
  2580 **
  2581 ** Every cursor is tripped, including cursors that belong
  2582 ** to other database connections that happen to be sharing
  2583 ** the cache with pBtree.
  2584 **
  2585 ** This routine gets called when a rollback occurs.
  2586 ** All cursors using the same cache must be tripped
  2587 ** to prevent them from trying to use the btree after
  2588 ** the rollback.  The rollback may have deleted tables
  2589 ** or moved root pages, so it is not sufficient to
  2590 ** save the state of the cursor.  The cursor must be
  2591 ** invalidated.
  2592 */
  2593 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
  2594   BtCursor *p;
  2595   sqlite3BtreeEnter(pBtree);
  2596   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  2597     clearCursorPosition(p);
  2598     p->eState = CURSOR_FAULT;
  2599     p->skip = errCode;
  2600   }
  2601   sqlite3BtreeLeave(pBtree);
  2602 }
  2603 
  2604 /*
  2605 ** Rollback the transaction in progress.  All cursors will be
  2606 ** invalided by this operation.  Any attempt to use a cursor
  2607 ** that was open at the beginning of this operation will result
  2608 ** in an error.
  2609 **
  2610 ** This will release the write lock on the database file.  If there
  2611 ** are no active cursors, it also releases the read lock.
  2612 */
  2613 int sqlite3BtreeRollback(Btree *p){
  2614   int rc;
  2615   BtShared *pBt = p->pBt;
  2616   MemPage *pPage1;
  2617 
  2618   sqlite3BtreeEnter(p);
  2619   pBt->db = p->db;
  2620   rc = saveAllCursors(pBt, 0, 0);
  2621 #ifndef SQLITE_OMIT_SHARED_CACHE
  2622   if( rc!=SQLITE_OK ){
  2623     /* This is a horrible situation. An IO or malloc() error occured whilst
  2624     ** trying to save cursor positions. If this is an automatic rollback (as
  2625     ** the result of a constraint, malloc() failure or IO error) then 
  2626     ** the cache may be internally inconsistent (not contain valid trees) so
  2627     ** we cannot simply return the error to the caller. Instead, abort 
  2628     ** all queries that may be using any of the cursors that failed to save.
  2629     */
  2630     sqlite3BtreeTripAllCursors(p, rc);
  2631   }
  2632 #endif
  2633   btreeIntegrity(p);
  2634   unlockAllTables(p);
  2635 
  2636   if( p->inTrans==TRANS_WRITE ){
  2637     int rc2;
  2638 
  2639 #ifndef SQLITE_OMIT_AUTOVACUUM
  2640     pBt->nTrunc = 0;
  2641 #endif
  2642 
  2643     assert( TRANS_WRITE==pBt->inTransaction );
  2644     rc2 = sqlite3PagerRollback(pBt->pPager);
  2645     if( rc2!=SQLITE_OK ){
  2646       rc = rc2;
  2647     }
  2648 
  2649     /* The rollback may have destroyed the pPage1->aData value.  So
  2650     ** call sqlite3BtreeGetPage() on page 1 again to make
  2651     ** sure pPage1->aData is set correctly. */
  2652     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
  2653       releasePage(pPage1);
  2654     }
  2655     assert( countWriteCursors(pBt)==0 );
  2656     pBt->inTransaction = TRANS_READ;
  2657   }
  2658 
  2659   if( p->inTrans!=TRANS_NONE ){
  2660     assert( pBt->nTransaction>0 );
  2661     pBt->nTransaction--;
  2662     if( 0==pBt->nTransaction ){
  2663       pBt->inTransaction = TRANS_NONE;
  2664     }
  2665   }
  2666 
  2667   p->inTrans = TRANS_NONE;
  2668   pBt->inStmt = 0;
  2669   unlockBtreeIfUnused(pBt);
  2670 
  2671   btreeIntegrity(p);
  2672   sqlite3BtreeLeave(p);
  2673   return rc;
  2674 }
  2675 
  2676 /*
  2677 ** Start a statement subtransaction.  The subtransaction can
  2678 ** can be rolled back independently of the main transaction.
  2679 ** You must start a transaction before starting a subtransaction.
  2680 ** The subtransaction is ended automatically if the main transaction
  2681 ** commits or rolls back.
  2682 **
  2683 ** Only one subtransaction may be active at a time.  It is an error to try
  2684 ** to start a new subtransaction if another subtransaction is already active.
  2685 **
  2686 ** Statement subtransactions are used around individual SQL statements
  2687 ** that are contained within a BEGIN...COMMIT block.  If a constraint
  2688 ** error occurs within the statement, the effect of that one statement
  2689 ** can be rolled back without having to rollback the entire transaction.
  2690 */
  2691 int sqlite3BtreeBeginStmt(Btree *p){
  2692   int rc;
  2693   BtShared *pBt = p->pBt;
  2694   sqlite3BtreeEnter(p);
  2695   pBt->db = p->db;
  2696   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
  2697     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  2698   }else{
  2699     assert( pBt->inTransaction==TRANS_WRITE );
  2700     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
  2701     pBt->inStmt = 1;
  2702   }
  2703   sqlite3BtreeLeave(p);
  2704   return rc;
  2705 }
  2706 
  2707 
  2708 /*
  2709 ** Commit the statment subtransaction currently in progress.  If no
  2710 ** subtransaction is active, this is a no-op.
  2711 */
  2712 int sqlite3BtreeCommitStmt(Btree *p){
  2713   int rc;
  2714   BtShared *pBt = p->pBt;
  2715   sqlite3BtreeEnter(p);
  2716   pBt->db = p->db;
  2717   if( pBt->inStmt && !pBt->readOnly ){
  2718     rc = sqlite3PagerStmtCommit(pBt->pPager);
  2719   }else{
  2720     rc = SQLITE_OK;
  2721   }
  2722   pBt->inStmt = 0;
  2723   sqlite3BtreeLeave(p);
  2724   return rc;
  2725 }
  2726 
  2727 /*
  2728 ** Rollback the active statement subtransaction.  If no subtransaction
  2729 ** is active this routine is a no-op.
  2730 **
  2731 ** All cursors will be invalidated by this operation.  Any attempt
  2732 ** to use a cursor that was open at the beginning of this operation
  2733 ** will result in an error.
  2734 */
  2735 int sqlite3BtreeRollbackStmt(Btree *p){
  2736   int rc = SQLITE_OK;
  2737   BtShared *pBt = p->pBt;
  2738   sqlite3BtreeEnter(p);
  2739   pBt->db = p->db;
  2740   if( pBt->inStmt && !pBt->readOnly ){
  2741     rc = sqlite3PagerStmtRollback(pBt->pPager);
  2742     pBt->inStmt = 0;
  2743   }
  2744   sqlite3BtreeLeave(p);
  2745   return rc;
  2746 }
  2747 
  2748 /*
  2749 ** Create a new cursor for the BTree whose root is on the page
  2750 ** iTable.  The act of acquiring a cursor gets a read lock on 
  2751 ** the database file.
  2752 **
  2753 ** If wrFlag==0, then the cursor can only be used for reading.
  2754 ** If wrFlag==1, then the cursor can be used for reading or for
  2755 ** writing if other conditions for writing are also met.  These
  2756 ** are the conditions that must be met in order for writing to
  2757 ** be allowed:
  2758 **
  2759 ** 1:  The cursor must have been opened with wrFlag==1
  2760 **
  2761 ** 2:  Other database connections that share the same pager cache
  2762 **     but which are not in the READ_UNCOMMITTED state may not have
  2763 **     cursors open with wrFlag==0 on the same table.  Otherwise
  2764 **     the changes made by this write cursor would be visible to
  2765 **     the read cursors in the other database connection.
  2766 **
  2767 ** 3:  The database must be writable (not on read-only media)
  2768 **
  2769 ** 4:  There must be an active transaction.
  2770 **
  2771 ** No checking is done to make sure that page iTable really is the
  2772 ** root page of a b-tree.  If it is not, then the cursor acquired
  2773 ** will not work correctly.
  2774 **
  2775 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory 
  2776 ** pointed to by pCur have been zeroed by the caller.
  2777 */
  2778 static int btreeCursor(
  2779   Btree *p,                              /* The btree */
  2780   int iTable,                            /* Root page of table to open */
  2781   int wrFlag,                            /* 1 to write. 0 read-only */
  2782   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  2783   BtCursor *pCur                         /* Space for new cursor */
  2784 ){
  2785   int rc;
  2786   BtShared *pBt = p->pBt;
  2787 
  2788   assert( sqlite3BtreeHoldsMutex(p) );
  2789   if( wrFlag ){
  2790     if( pBt->readOnly ){
  2791       return SQLITE_READONLY;
  2792     }
  2793     if( checkReadLocks(p, iTable, 0, 0) ){
  2794       return SQLITE_LOCKED;
  2795     }
  2796   }
  2797 
  2798   if( pBt->pPage1==0 ){
  2799     rc = lockBtreeWithRetry(p);
  2800     if( rc!=SQLITE_OK ){
  2801       return rc;
  2802     }
  2803     if( pBt->readOnly && wrFlag ){
  2804       return SQLITE_READONLY;
  2805     }
  2806   }
  2807   pCur->pgnoRoot = (Pgno)iTable;
  2808   if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
  2809     rc = SQLITE_EMPTY;
  2810     goto create_cursor_exception;
  2811   }
  2812   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
  2813   if( rc!=SQLITE_OK ){
  2814     goto create_cursor_exception;
  2815   }
  2816 
  2817   /* Now that no other errors can occur, finish filling in the BtCursor
  2818   ** variables, link the cursor into the BtShared list and set *ppCur (the
  2819   ** output argument to this function).
  2820   */
  2821   pCur->pKeyInfo = pKeyInfo;
  2822   pCur->pBtree = p;
  2823   pCur->pBt = pBt;
  2824   pCur->wrFlag = wrFlag;
  2825   pCur->pNext = pBt->pCursor;
  2826   if( pCur->pNext ){
  2827     pCur->pNext->pPrev = pCur;
  2828   }
  2829   pBt->pCursor = pCur;
  2830   pCur->eState = CURSOR_INVALID;
  2831 
  2832   return SQLITE_OK;
  2833 
  2834 create_cursor_exception:
  2835   releasePage(pCur->apPage[0]);
  2836   unlockBtreeIfUnused(pBt);
  2837   return rc;
  2838 }
  2839 int sqlite3BtreeCursor(
  2840   Btree *p,                                   /* The btree */
  2841   int iTable,                                 /* Root page of table to open */
  2842   int wrFlag,                                 /* 1 to write. 0 read-only */
  2843   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  2844   BtCursor *pCur                              /* Write new cursor here */
  2845 ){
  2846   int rc;
  2847   sqlite3BtreeEnter(p);
  2848   p->pBt->db = p->db;
  2849   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  2850   sqlite3BtreeLeave(p);
  2851   return rc;
  2852 }
  2853 int sqlite3BtreeCursorSize(){
  2854   return sizeof(BtCursor);
  2855 }
  2856 
  2857 
  2858 
  2859 /*
  2860 ** Close a cursor.  The read lock on the database file is released
  2861 ** when the last cursor is closed.
  2862 */
  2863 int sqlite3BtreeCloseCursor(BtCursor *pCur){
  2864   Btree *pBtree = pCur->pBtree;
  2865   if( pBtree ){
  2866     int i;
  2867     BtShared *pBt = pCur->pBt;
  2868     sqlite3BtreeEnter(pBtree);
  2869     pBt->db = pBtree->db;
  2870     clearCursorPosition(pCur);
  2871     if( pCur->pPrev ){
  2872       pCur->pPrev->pNext = pCur->pNext;
  2873     }else{
  2874       pBt->pCursor = pCur->pNext;
  2875     }
  2876     if( pCur->pNext ){
  2877       pCur->pNext->pPrev = pCur->pPrev;
  2878     }
  2879     for(i=0; i<=pCur->iPage; i++){
  2880       releasePage(pCur->apPage[i]);
  2881     }
  2882     unlockBtreeIfUnused(pBt);
  2883     invalidateOverflowCache(pCur);
  2884     /* sqlite3_free(pCur); */
  2885     sqlite3BtreeLeave(pBtree);
  2886   }
  2887   return SQLITE_OK;
  2888 }
  2889 
  2890 /*
  2891 ** Make a temporary cursor by filling in the fields of pTempCur.
  2892 ** The temporary cursor is not on the cursor list for the Btree.
  2893 */
  2894 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
  2895   int i;
  2896   assert( cursorHoldsMutex(pCur) );
  2897   memcpy(pTempCur, pCur, sizeof(BtCursor));
  2898   pTempCur->pNext = 0;
  2899   pTempCur->pPrev = 0;
  2900   for(i=0; i<=pTempCur->iPage; i++){
  2901     sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
  2902   }
  2903   assert( pCur->pKey==0 );
  2904 }
  2905 
  2906 /*
  2907 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
  2908 ** function above.
  2909 */
  2910 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
  2911   int i;
  2912   assert( cursorHoldsMutex(pCur) );
  2913   for(i=0; i<=pCur->iPage; i++){
  2914     sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
  2915   }
  2916   sqlite3_free(pCur->pKey);
  2917 }
  2918 
  2919 /*
  2920 ** Make sure the BtCursor* given in the argument has a valid
  2921 ** BtCursor.info structure.  If it is not already valid, call
  2922 ** sqlite3BtreeParseCell() to fill it in.
  2923 **
  2924 ** BtCursor.info is a cache of the information in the current cell.
  2925 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
  2926 **
  2927 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
  2928 ** compiler to crash when getCellInfo() is implemented as a macro.
  2929 ** But there is a measureable speed advantage to using the macro on gcc
  2930 ** (when less compiler optimizations like -Os or -O0 are used and the
  2931 ** compiler is not doing agressive inlining.)  So we use a real function
  2932 ** for MSVC and a macro for everything else.  Ticket #2457.
  2933 */
  2934 #ifndef NDEBUG
  2935   static void assertCellInfo(BtCursor *pCur){
  2936     CellInfo info;
  2937     int iPage = pCur->iPage;
  2938     memset(&info, 0, sizeof(info));
  2939     sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
  2940     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
  2941   }
  2942 #else
  2943   #define assertCellInfo(x)
  2944 #endif
  2945 #ifdef _MSC_VER
  2946   /* Use a real function in MSVC to work around bugs in that compiler. */
  2947   static void getCellInfo(BtCursor *pCur){
  2948     if( pCur->info.nSize==0 ){
  2949       int iPage = pCur->iPage;
  2950       sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
  2951       pCur->validNKey = 1;
  2952     }else{
  2953       assertCellInfo(pCur);
  2954     }
  2955   }
  2956 #else /* if not _MSC_VER */
  2957   /* Use a macro in all other compilers so that the function is inlined */
  2958 #define getCellInfo(pCur)                                                      \
  2959   if( pCur->info.nSize==0 ){                                                   \
  2960     int iPage = pCur->iPage;                                                   \
  2961     sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
  2962     pCur->validNKey = 1;                                                       \
  2963   }else{                                                                       \
  2964     assertCellInfo(pCur);                                                      \
  2965   }
  2966 #endif /* _MSC_VER */
  2967 
  2968 /*
  2969 ** Set *pSize to the size of the buffer needed to hold the value of
  2970 ** the key for the current entry.  If the cursor is not pointing
  2971 ** to a valid entry, *pSize is set to 0. 
  2972 **
  2973 ** For a table with the INTKEY flag set, this routine returns the key
  2974 ** itself, not the number of bytes in the key.
  2975 */
  2976 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
  2977   int rc;
  2978 
  2979   assert( cursorHoldsMutex(pCur) );
  2980   rc = restoreCursorPosition(pCur);
  2981   if( rc==SQLITE_OK ){
  2982     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  2983     if( pCur->eState==CURSOR_INVALID ){
  2984       *pSize = 0;
  2985     }else{
  2986       getCellInfo(pCur);
  2987       *pSize = pCur->info.nKey;
  2988     }
  2989   }
  2990   return rc;
  2991 }
  2992 
  2993 /*
  2994 ** Set *pSize to the number of bytes of data in the entry the
  2995 ** cursor currently points to.  Always return SQLITE_OK.
  2996 ** Failure is not possible.  If the cursor is not currently
  2997 ** pointing to an entry (which can happen, for example, if
  2998 ** the database is empty) then *pSize is set to 0.
  2999 */
  3000 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
  3001   int rc;
  3002 
  3003   assert( cursorHoldsMutex(pCur) );
  3004   rc = restoreCursorPosition(pCur);
  3005   if( rc==SQLITE_OK ){
  3006     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  3007     if( pCur->eState==CURSOR_INVALID ){
  3008       /* Not pointing at a valid entry - set *pSize to 0. */
  3009       *pSize = 0;
  3010     }else{
  3011       getCellInfo(pCur);
  3012       *pSize = pCur->info.nData;
  3013     }
  3014   }
  3015   return rc;
  3016 }
  3017 
  3018 /*
  3019 ** Given the page number of an overflow page in the database (parameter
  3020 ** ovfl), this function finds the page number of the next page in the 
  3021 ** linked list of overflow pages. If possible, it uses the auto-vacuum
  3022 ** pointer-map data instead of reading the content of page ovfl to do so. 
  3023 **
  3024 ** If an error occurs an SQLite error code is returned. Otherwise:
  3025 **
  3026 ** Unless pPgnoNext is NULL, the page number of the next overflow 
  3027 ** page in the linked list is written to *pPgnoNext. If page ovfl
  3028 ** is the last page in its linked list, *pPgnoNext is set to zero. 
  3029 **
  3030 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
  3031 ** for page ovfl. The underlying pager page may have been requested
  3032 ** with the noContent flag set, so the page data accessable via
  3033 ** this handle may not be trusted.
  3034 */
  3035 static int getOverflowPage(
  3036   BtShared *pBt, 
  3037   Pgno ovfl,                   /* Overflow page */
  3038   MemPage **ppPage,            /* OUT: MemPage handle */
  3039   Pgno *pPgnoNext              /* OUT: Next overflow page number */
  3040 ){
  3041   Pgno next = 0;
  3042   int rc;
  3043 
  3044   assert( sqlite3_mutex_held(pBt->mutex) );
  3045   /* One of these must not be NULL. Otherwise, why call this function? */
  3046   assert(ppPage || pPgnoNext);
  3047 
  3048   /* If pPgnoNext is NULL, then this function is being called to obtain
  3049   ** a MemPage* reference only. No page-data is required in this case.
  3050   */
  3051   if( !pPgnoNext ){
  3052     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
  3053   }
  3054 
  3055 #ifndef SQLITE_OMIT_AUTOVACUUM
  3056   /* Try to find the next page in the overflow list using the
  3057   ** autovacuum pointer-map pages. Guess that the next page in 
  3058   ** the overflow list is page number (ovfl+1). If that guess turns 
  3059   ** out to be wrong, fall back to loading the data of page 
  3060   ** number ovfl to determine the next page number.
  3061   */
  3062   if( pBt->autoVacuum ){
  3063     Pgno pgno;
  3064     Pgno iGuess = ovfl+1;
  3065     u8 eType;
  3066 
  3067     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
  3068       iGuess++;
  3069     }
  3070 
  3071     if( iGuess<=pagerPagecount(pBt->pPager) ){
  3072       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
  3073       if( rc!=SQLITE_OK ){
  3074         return rc;
  3075       }
  3076       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
  3077         next = iGuess;
  3078       }
  3079     }
  3080   }
  3081 #endif
  3082 
  3083   if( next==0 || ppPage ){
  3084     MemPage *pPage = 0;
  3085 
  3086     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
  3087     assert(rc==SQLITE_OK || pPage==0);
  3088     if( next==0 && rc==SQLITE_OK ){
  3089       next = get4byte(pPage->aData);
  3090     }
  3091 
  3092     if( ppPage ){
  3093       *ppPage = pPage;
  3094     }else{
  3095       releasePage(pPage);
  3096     }
  3097   }
  3098   *pPgnoNext = next;
  3099 
  3100   return rc;
  3101 }
  3102 
  3103 /*
  3104 ** Copy data from a buffer to a page, or from a page to a buffer.
  3105 **
  3106 ** pPayload is a pointer to data stored on database page pDbPage.
  3107 ** If argument eOp is false, then nByte bytes of data are copied
  3108 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
  3109 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
  3110 ** of data are copied from the buffer pBuf to pPayload.
  3111 **
  3112 ** SQLITE_OK is returned on success, otherwise an error code.
  3113 */
  3114 static int copyPayload(
  3115   void *pPayload,           /* Pointer to page data */
  3116   void *pBuf,               /* Pointer to buffer */
  3117   int nByte,                /* Number of bytes to copy */
  3118   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
  3119   DbPage *pDbPage           /* Page containing pPayload */
  3120 ){
  3121   if( eOp ){
  3122     /* Copy data from buffer to page (a write operation) */
  3123     int rc = sqlite3PagerWrite(pDbPage);
  3124     if( rc!=SQLITE_OK ){
  3125       return rc;
  3126     }
  3127     memcpy(pPayload, pBuf, nByte);
  3128   }else{
  3129     /* Copy data from page to buffer (a read operation) */
  3130     memcpy(pBuf, pPayload, nByte);
  3131   }
  3132   return SQLITE_OK;
  3133 }
  3134 
  3135 /*
  3136 ** This function is used to read or overwrite payload information
  3137 ** for the entry that the pCur cursor is pointing to. If the eOp
  3138 ** parameter is 0, this is a read operation (data copied into
  3139 ** buffer pBuf). If it is non-zero, a write (data copied from
  3140 ** buffer pBuf).
  3141 **
  3142 ** A total of "amt" bytes are read or written beginning at "offset".
  3143 ** Data is read to or from the buffer pBuf.
  3144 **
  3145 ** This routine does not make a distinction between key and data.
  3146 ** It just reads or writes bytes from the payload area.  Data might 
  3147 ** appear on the main page or be scattered out on multiple overflow 
  3148 ** pages.
  3149 **
  3150 ** If the BtCursor.isIncrblobHandle flag is set, and the current
  3151 ** cursor entry uses one or more overflow pages, this function
  3152 ** allocates space for and lazily popluates the overflow page-list 
  3153 ** cache array (BtCursor.aOverflow). Subsequent calls use this
  3154 ** cache to make seeking to the supplied offset more efficient.
  3155 **
  3156 ** Once an overflow page-list cache has been allocated, it may be
  3157 ** invalidated if some other cursor writes to the same table, or if
  3158 ** the cursor is moved to a different row. Additionally, in auto-vacuum
  3159 ** mode, the following events may invalidate an overflow page-list cache.
  3160 **
  3161 **   * An incremental vacuum,
  3162 **   * A commit in auto_vacuum="full" mode,
  3163 **   * Creating a table (may require moving an overflow page).
  3164 */
  3165 static int accessPayload(
  3166   BtCursor *pCur,      /* Cursor pointing to entry to read from */
  3167   int offset,          /* Begin reading this far into payload */
  3168   int amt,             /* Read this many bytes */
  3169   unsigned char *pBuf, /* Write the bytes into this buffer */ 
  3170   int skipKey,         /* offset begins at data if this is true */
  3171   int eOp              /* zero to read. non-zero to write. */
  3172 ){
  3173   unsigned char *aPayload;
  3174   int rc = SQLITE_OK;
  3175   u32 nKey;
  3176   int iIdx = 0;
  3177   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  3178   BtShared *pBt;                              /* Btree this cursor belongs to */
  3179 
  3180   assert( pPage );
  3181   assert( pCur->eState==CURSOR_VALID );
  3182   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3183   assert( offset>=0 );
  3184   assert( cursorHoldsMutex(pCur) );
  3185 
  3186   getCellInfo(pCur);
  3187   aPayload = pCur->info.pCell + pCur->info.nHeader;
  3188   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
  3189 
  3190   if( skipKey ){
  3191     offset += nKey;
  3192   }
  3193   if( offset+amt > nKey+pCur->info.nData ){
  3194     /* Trying to read or write past the end of the data is an error */
  3195     return SQLITE_CORRUPT_BKPT;
  3196   }
  3197 
  3198   /* Check if data must be read/written to/from the btree page itself. */
  3199   if( offset<pCur->info.nLocal ){
  3200     int a = amt;
  3201     if( a+offset>pCur->info.nLocal ){
  3202       a = pCur->info.nLocal - offset;
  3203     }
  3204     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
  3205     offset = 0;
  3206     pBuf += a;
  3207     amt -= a;
  3208   }else{
  3209     offset -= pCur->info.nLocal;
  3210   }
  3211 
  3212   pBt = pCur->pBt;
  3213   if( rc==SQLITE_OK && amt>0 ){
  3214     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
  3215     Pgno nextPage;
  3216 
  3217     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
  3218 
  3219 #ifndef SQLITE_OMIT_INCRBLOB
  3220     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
  3221     ** has not been allocated, allocate it now. The array is sized at
  3222     ** one entry for each overflow page in the overflow chain. The
  3223     ** page number of the first overflow page is stored in aOverflow[0],
  3224     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
  3225     ** (the cache is lazily populated).
  3226     */
  3227     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
  3228       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
  3229       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
  3230       if( nOvfl && !pCur->aOverflow ){
  3231         rc = SQLITE_NOMEM;
  3232       }
  3233     }
  3234 
  3235     /* If the overflow page-list cache has been allocated and the
  3236     ** entry for the first required overflow page is valid, skip
  3237     ** directly to it.
  3238     */
  3239     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
  3240       iIdx = (offset/ovflSize);
  3241       nextPage = pCur->aOverflow[iIdx];
  3242       offset = (offset%ovflSize);
  3243     }
  3244 #endif
  3245 
  3246     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
  3247 
  3248 #ifndef SQLITE_OMIT_INCRBLOB
  3249       /* If required, populate the overflow page-list cache. */
  3250       if( pCur->aOverflow ){
  3251         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
  3252         pCur->aOverflow[iIdx] = nextPage;
  3253       }
  3254 #endif
  3255 
  3256       if( offset>=ovflSize ){
  3257         /* The only reason to read this page is to obtain the page
  3258         ** number for the next page in the overflow chain. The page
  3259         ** data is not required. So first try to lookup the overflow
  3260         ** page-list cache, if any, then fall back to the getOverflowPage()
  3261         ** function.
  3262         */
  3263 #ifndef SQLITE_OMIT_INCRBLOB
  3264         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
  3265           nextPage = pCur->aOverflow[iIdx+1];
  3266         } else 
  3267 #endif
  3268           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
  3269         offset -= ovflSize;
  3270       }else{
  3271         /* Need to read this page properly. It contains some of the
  3272         ** range of data that is being read (eOp==0) or written (eOp!=0).
  3273         */
  3274         DbPage *pDbPage;
  3275         int a = amt;
  3276         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
  3277         if( rc==SQLITE_OK ){
  3278           aPayload = sqlite3PagerGetData(pDbPage);
  3279           nextPage = get4byte(aPayload);
  3280           if( a + offset > ovflSize ){
  3281             a = ovflSize - offset;
  3282           }
  3283           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
  3284           sqlite3PagerUnref(pDbPage);
  3285           offset = 0;
  3286           amt -= a;
  3287           pBuf += a;
  3288         }
  3289       }
  3290     }
  3291   }
  3292 
  3293   if( rc==SQLITE_OK && amt>0 ){
  3294     return SQLITE_CORRUPT_BKPT;
  3295   }
  3296   return rc;
  3297 }
  3298 
  3299 /*
  3300 ** Read part of the key associated with cursor pCur.  Exactly
  3301 ** "amt" bytes will be transfered into pBuf[].  The transfer
  3302 ** begins at "offset".
  3303 **
  3304 ** Return SQLITE_OK on success or an error code if anything goes
  3305 ** wrong.  An error is returned if "offset+amt" is larger than
  3306 ** the available payload.
  3307 */
  3308 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3309   int rc;
  3310 
  3311   assert( cursorHoldsMutex(pCur) );
  3312   rc = restoreCursorPosition(pCur);
  3313   if( rc==SQLITE_OK ){
  3314     assert( pCur->eState==CURSOR_VALID );
  3315     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3316     if( pCur->apPage[0]->intKey ){
  3317       return SQLITE_CORRUPT_BKPT;
  3318     }
  3319     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3320     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
  3321   }
  3322   return rc;
  3323 }
  3324 
  3325 /*
  3326 ** Read part of the data associated with cursor pCur.  Exactly
  3327 ** "amt" bytes will be transfered into pBuf[].  The transfer
  3328 ** begins at "offset".
  3329 **
  3330 ** Return SQLITE_OK on success or an error code if anything goes
  3331 ** wrong.  An error is returned if "offset+amt" is larger than
  3332 ** the available payload.
  3333 */
  3334 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3335   int rc;
  3336 
  3337 #ifndef SQLITE_OMIT_INCRBLOB
  3338   if ( pCur->eState==CURSOR_INVALID ){
  3339     return SQLITE_ABORT;
  3340   }
  3341 #endif
  3342 
  3343   assert( cursorHoldsMutex(pCur) );
  3344   rc = restoreCursorPosition(pCur);
  3345   if( rc==SQLITE_OK ){
  3346     assert( pCur->eState==CURSOR_VALID );
  3347     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3348     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3349     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
  3350   }
  3351   return rc;
  3352 }
  3353 
  3354 /*
  3355 ** Return a pointer to payload information from the entry that the 
  3356 ** pCur cursor is pointing to.  The pointer is to the beginning of
  3357 ** the key if skipKey==0 and it points to the beginning of data if
  3358 ** skipKey==1.  The number of bytes of available key/data is written
  3359 ** into *pAmt.  If *pAmt==0, then the value returned will not be
  3360 ** a valid pointer.
  3361 **
  3362 ** This routine is an optimization.  It is common for the entire key
  3363 ** and data to fit on the local page and for there to be no overflow
  3364 ** pages.  When that is so, this routine can be used to access the
  3365 ** key and data without making a copy.  If the key and/or data spills
  3366 ** onto overflow pages, then accessPayload() must be used to reassembly
  3367 ** the key/data and copy it into a preallocated buffer.
  3368 **
  3369 ** The pointer returned by this routine looks directly into the cached
  3370 ** page of the database.  The data might change or move the next time
  3371 ** any btree routine is called.
  3372 */
  3373 static const unsigned char *fetchPayload(
  3374   BtCursor *pCur,      /* Cursor pointing to entry to read from */
  3375   int *pAmt,           /* Write the number of available bytes here */
  3376   int skipKey          /* read beginning at data if this is true */
  3377 ){
  3378   unsigned char *aPayload;
  3379   MemPage *pPage;
  3380   u32 nKey;
  3381   int nLocal;
  3382 
  3383   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
  3384   assert( pCur->eState==CURSOR_VALID );
  3385   assert( cursorHoldsMutex(pCur) );
  3386   pPage = pCur->apPage[pCur->iPage];
  3387   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3388   getCellInfo(pCur);
  3389   aPayload = pCur->info.pCell;
  3390   aPayload += pCur->info.nHeader;
  3391   if( pPage->intKey ){
  3392     nKey = 0;
  3393   }else{
  3394     nKey = pCur->info.nKey;
  3395   }
  3396   if( skipKey ){
  3397     aPayload += nKey;
  3398     nLocal = pCur->info.nLocal - nKey;
  3399   }else{
  3400     nLocal = pCur->info.nLocal;
  3401     if( nLocal>nKey ){
  3402       nLocal = nKey;
  3403     }
  3404   }
  3405   *pAmt = nLocal;
  3406   return aPayload;
  3407 }
  3408 
  3409 
  3410 /*
  3411 ** For the entry that cursor pCur is point to, return as
  3412 ** many bytes of the key or data as are available on the local
  3413 ** b-tree page.  Write the number of available bytes into *pAmt.
  3414 **
  3415 ** The pointer returned is ephemeral.  The key/data may move
  3416 ** or be destroyed on the next call to any Btree routine,
  3417 ** including calls from other threads against the same cache.
  3418 ** Hence, a mutex on the BtShared should be held prior to calling
  3419 ** this routine.
  3420 **
  3421 ** These routines is used to get quick access to key and data
  3422 ** in the common case where no overflow pages are used.
  3423 */
  3424 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
  3425   assert( cursorHoldsMutex(pCur) );
  3426   if( pCur->eState==CURSOR_VALID ){
  3427     return (const void*)fetchPayload(pCur, pAmt, 0);
  3428   }
  3429   return 0;
  3430 }
  3431 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
  3432   assert( cursorHoldsMutex(pCur) );
  3433   if( pCur->eState==CURSOR_VALID ){
  3434     return (const void*)fetchPayload(pCur, pAmt, 1);
  3435   }
  3436   return 0;
  3437 }
  3438 
  3439 
  3440 /*
  3441 ** Move the cursor down to a new child page.  The newPgno argument is the
  3442 ** page number of the child page to move to.
  3443 */
  3444 static int moveToChild(BtCursor *pCur, u32 newPgno){
  3445   int rc;
  3446   int i = pCur->iPage;
  3447   MemPage *pNewPage;
  3448   BtShared *pBt = pCur->pBt;
  3449 
  3450   assert( cursorHoldsMutex(pCur) );
  3451   assert( pCur->eState==CURSOR_VALID );
  3452   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
  3453   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
  3454     return SQLITE_CORRUPT_BKPT;
  3455   }
  3456   rc = getAndInitPage(pBt, newPgno, &pNewPage);
  3457   if( rc ) return rc;
  3458   pCur->apPage[i+1] = pNewPage;
  3459   pCur->aiIdx[i+1] = 0;
  3460   pCur->iPage++;
  3461 
  3462   pCur->info.nSize = 0;
  3463   pCur->validNKey = 0;
  3464   if( pNewPage->nCell<1 ){
  3465     return SQLITE_CORRUPT_BKPT;
  3466   }
  3467   return SQLITE_OK;
  3468 }
  3469 
  3470 #ifndef NDEBUG
  3471 /*
  3472 ** Page pParent is an internal (non-leaf) tree page. This function 
  3473 ** asserts that page number iChild is the left-child if the iIdx'th
  3474 ** cell in page pParent. Or, if iIdx is equal to the total number of
  3475 ** cells in pParent, that page number iChild is the right-child of
  3476 ** the page.
  3477 */
  3478 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  3479   assert( iIdx<=pParent->nCell );
  3480   if( iIdx==pParent->nCell ){
  3481     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  3482   }else{
  3483     assert( get4byte(findCell(pParent, iIdx))==iChild );
  3484   }
  3485 }
  3486 #else
  3487 #  define assertParentIndex(x,y,z) 
  3488 #endif
  3489 
  3490 /*
  3491 ** Move the cursor up to the parent page.
  3492 **
  3493 ** pCur->idx is set to the cell index that contains the pointer
  3494 ** to the page we are coming from.  If we are coming from the
  3495 ** right-most child page then pCur->idx is set to one more than
  3496 ** the largest cell index.
  3497 */
  3498 void sqlite3BtreeMoveToParent(BtCursor *pCur){
  3499   assert( cursorHoldsMutex(pCur) );
  3500   assert( pCur->eState==CURSOR_VALID );
  3501   assert( pCur->iPage>0 );
  3502   assert( pCur->apPage[pCur->iPage] );
  3503   assertParentIndex(
  3504     pCur->apPage[pCur->iPage-1], 
  3505     pCur->aiIdx[pCur->iPage-1], 
  3506     pCur->apPage[pCur->iPage]->pgno
  3507   );
  3508   releasePage(pCur->apPage[pCur->iPage]);
  3509   pCur->iPage--;
  3510   pCur->info.nSize = 0;
  3511   pCur->validNKey = 0;
  3512 }
  3513 
  3514 /*
  3515 ** Move the cursor to the root page
  3516 */
  3517 static int moveToRoot(BtCursor *pCur){
  3518   MemPage *pRoot;
  3519   int rc = SQLITE_OK;
  3520   Btree *p = pCur->pBtree;
  3521   BtShared *pBt = p->pBt;
  3522 
  3523   assert( cursorHoldsMutex(pCur) );
  3524   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  3525   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  3526   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  3527   if( pCur->eState>=CURSOR_REQUIRESEEK ){
  3528     if( pCur->eState==CURSOR_FAULT ){
  3529       return pCur->skip;
  3530     }
  3531     clearCursorPosition(pCur);
  3532   }
  3533 
  3534   if( pCur->iPage>=0 ){
  3535     int i;
  3536     for(i=1; i<=pCur->iPage; i++){
  3537       releasePage(pCur->apPage[i]);
  3538     }
  3539   }else{
  3540     if( 
  3541       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
  3542     ){
  3543       pCur->eState = CURSOR_INVALID;
  3544       return rc;
  3545     }
  3546   }
  3547 
  3548   pRoot = pCur->apPage[0];
  3549   assert( pRoot->pgno==pCur->pgnoRoot );
  3550   pCur->iPage = 0;
  3551   pCur->aiIdx[0] = 0;
  3552   pCur->info.nSize = 0;
  3553   pCur->atLast = 0;
  3554   pCur->validNKey = 0;
  3555 
  3556   if( pRoot->nCell==0 && !pRoot->leaf ){
  3557     Pgno subpage;
  3558     assert( pRoot->pgno==1 );
  3559     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
  3560     assert( subpage>0 );
  3561     pCur->eState = CURSOR_VALID;
  3562     rc = moveToChild(pCur, subpage);
  3563   }else{
  3564     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
  3565   }
  3566   return rc;
  3567 }
  3568 
  3569 /*
  3570 ** Move the cursor down to the left-most leaf entry beneath the
  3571 ** entry to which it is currently pointing.
  3572 **
  3573 ** The left-most leaf is the one with the smallest key - the first
  3574 ** in ascending order.
  3575 */
  3576 static int moveToLeftmost(BtCursor *pCur){
  3577   Pgno pgno;
  3578   int rc = SQLITE_OK;
  3579   MemPage *pPage;
  3580 
  3581   assert( cursorHoldsMutex(pCur) );
  3582   assert( pCur->eState==CURSOR_VALID );
  3583   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  3584     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3585     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
  3586     rc = moveToChild(pCur, pgno);
  3587   }
  3588   return rc;
  3589 }
  3590 
  3591 /*
  3592 ** Move the cursor down to the right-most leaf entry beneath the
  3593 ** page to which it is currently pointing.  Notice the difference
  3594 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
  3595 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
  3596 ** finds the right-most entry beneath the *page*.
  3597 **
  3598 ** The right-most entry is the one with the largest key - the last
  3599 ** key in ascending order.
  3600 */
  3601 static int moveToRightmost(BtCursor *pCur){
  3602   Pgno pgno;
  3603   int rc = SQLITE_OK;
  3604   MemPage *pPage;
  3605 
  3606   assert( cursorHoldsMutex(pCur) );
  3607   assert( pCur->eState==CURSOR_VALID );
  3608   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  3609     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  3610     pCur->aiIdx[pCur->iPage] = pPage->nCell;
  3611     rc = moveToChild(pCur, pgno);
  3612   }
  3613   if( rc==SQLITE_OK ){
  3614     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
  3615     pCur->info.nSize = 0;
  3616     pCur->validNKey = 0;
  3617   }
  3618   return rc;
  3619 }
  3620 
  3621 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
  3622 ** on success.  Set *pRes to 0 if the cursor actually points to something
  3623 ** or set *pRes to 1 if the table is empty.
  3624 */
  3625 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
  3626   int rc;
  3627 
  3628   assert( cursorHoldsMutex(pCur) );
  3629   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3630   rc = moveToRoot(pCur);
  3631   if( rc==SQLITE_OK ){
  3632     if( pCur->eState==CURSOR_INVALID ){
  3633       assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3634       *pRes = 1;
  3635       rc = SQLITE_OK;
  3636     }else{
  3637       assert( pCur->apPage[pCur->iPage]->nCell>0 );
  3638       *pRes = 0;
  3639       rc = moveToLeftmost(pCur);
  3640     }
  3641   }
  3642   return rc;
  3643 }
  3644 
  3645 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
  3646 ** on success.  Set *pRes to 0 if the cursor actually points to something
  3647 ** or set *pRes to 1 if the table is empty.
  3648 */
  3649 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
  3650   int rc;
  3651  
  3652   assert( cursorHoldsMutex(pCur) );
  3653   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3654   rc = moveToRoot(pCur);
  3655   if( rc==SQLITE_OK ){
  3656     if( CURSOR_INVALID==pCur->eState ){
  3657       assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3658       *pRes = 1;
  3659     }else{
  3660       assert( pCur->eState==CURSOR_VALID );
  3661       *pRes = 0;
  3662       rc = moveToRightmost(pCur);
  3663       getCellInfo(pCur);
  3664       pCur->atLast = rc==SQLITE_OK;
  3665     }
  3666   }
  3667   return rc;
  3668 }
  3669 
  3670 /* Move the cursor so that it points to an entry near the key 
  3671 ** specified by pIdxKey or intKey.   Return a success code.
  3672 **
  3673 ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
  3674 ** must be NULL.  For index tables, pIdxKey is used and intKey
  3675 ** is ignored.
  3676 **
  3677 ** If an exact match is not found, then the cursor is always
  3678 ** left pointing at a leaf page which would hold the entry if it
  3679 ** were present.  The cursor might point to an entry that comes
  3680 ** before or after the key.
  3681 **
  3682 ** The result of comparing the key with the entry to which the
  3683 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
  3684 ** this value is as follows:
  3685 **
  3686 **     *pRes<0      The cursor is left pointing at an entry that
  3687 **                  is smaller than pKey or if the table is empty
  3688 **                  and the cursor is therefore left point to nothing.
  3689 **
  3690 **     *pRes==0     The cursor is left pointing at an entry that
  3691 **                  exactly matches pKey.
  3692 **
  3693 **     *pRes>0      The cursor is left pointing at an entry that
  3694 **                  is larger than pKey.
  3695 **
  3696 */
  3697 int sqlite3BtreeMovetoUnpacked(
  3698   BtCursor *pCur,          /* The cursor to be moved */
  3699   UnpackedRecord *pIdxKey, /* Unpacked index key */
  3700   i64 intKey,              /* The table key */
  3701   int biasRight,           /* If true, bias the search to the high end */
  3702   int *pRes                /* Write search results here */
  3703 ){
  3704   int rc;
  3705 
  3706   assert( cursorHoldsMutex(pCur) );
  3707   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3708 
  3709   /* If the cursor is already positioned at the point we are trying
  3710   ** to move to, then just return without doing any work */
  3711   if( pCur->eState==CURSOR_VALID && pCur->validNKey 
  3712    && pCur->apPage[0]->intKey 
  3713   ){
  3714     if( pCur->info.nKey==intKey ){
  3715       *pRes = 0;
  3716       return SQLITE_OK;
  3717     }
  3718     if( pCur->atLast && pCur->info.nKey<intKey ){
  3719       *pRes = -1;
  3720       return SQLITE_OK;
  3721     }
  3722   }
  3723 
  3724   rc = moveToRoot(pCur);
  3725   if( rc ){
  3726     return rc;
  3727   }
  3728   assert( pCur->apPage[pCur->iPage] );
  3729   assert( pCur->apPage[pCur->iPage]->isInit );
  3730   if( pCur->eState==CURSOR_INVALID ){
  3731     *pRes = -1;
  3732     assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3733     return SQLITE_OK;
  3734   }
  3735   assert( pCur->apPage[0]->intKey || pIdxKey );
  3736   for(;;){
  3737     int lwr, upr;
  3738     Pgno chldPg;
  3739     MemPage *pPage = pCur->apPage[pCur->iPage];
  3740     int c = -1;  /* pRes return if table is empty must be -1 */
  3741     lwr = 0;
  3742     upr = pPage->nCell-1;
  3743     if( !pPage->intKey && pIdxKey==0 ){
  3744       rc = SQLITE_CORRUPT_BKPT;
  3745       goto moveto_finish;
  3746     }
  3747     if( biasRight ){
  3748       pCur->aiIdx[pCur->iPage] = upr;
  3749     }else{
  3750       pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
  3751     }
  3752     if( lwr<=upr ) for(;;){
  3753       void *pCellKey;
  3754       i64 nCellKey;
  3755       int idx = pCur->aiIdx[pCur->iPage];
  3756       pCur->info.nSize = 0;
  3757       pCur->validNKey = 1;
  3758       if( pPage->intKey ){
  3759         u8 *pCell;
  3760         pCell = findCell(pPage, idx) + pPage->childPtrSize;
  3761         if( pPage->hasData ){
  3762           u32 dummy;
  3763           pCell += getVarint32(pCell, dummy);
  3764         }
  3765         getVarint(pCell, (u64*)&nCellKey);
  3766         if( nCellKey==intKey ){
  3767           c = 0;
  3768         }else if( nCellKey<intKey ){
  3769           c = -1;
  3770         }else{
  3771           assert( nCellKey>intKey );
  3772           c = +1;
  3773         }
  3774       }else{
  3775         int available;
  3776         pCellKey = (void *)fetchPayload(pCur, &available, 0);
  3777         nCellKey = pCur->info.nKey;
  3778         if( available>=nCellKey ){
  3779           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
  3780         }else{
  3781           pCellKey = sqlite3Malloc( nCellKey );
  3782           if( pCellKey==0 ){
  3783             rc = SQLITE_NOMEM;
  3784             goto moveto_finish;
  3785           }
  3786           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
  3787           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
  3788           sqlite3_free(pCellKey);
  3789           if( rc ) goto moveto_finish;
  3790         }
  3791       }
  3792       if( c==0 ){
  3793         pCur->info.nKey = nCellKey;
  3794         if( pPage->intKey && !pPage->leaf ){
  3795           lwr = idx;
  3796           upr = lwr - 1;
  3797           break;
  3798         }else{
  3799           if( pRes ) *pRes = 0;
  3800           rc = SQLITE_OK;
  3801           goto moveto_finish;
  3802         }
  3803       }
  3804       if( c<0 ){
  3805         lwr = idx+1;
  3806       }else{
  3807         upr = idx-1;
  3808       }
  3809       if( lwr>upr ){
  3810         pCur->info.nKey = nCellKey;
  3811         break;
  3812       }
  3813       pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
  3814     }
  3815     assert( lwr==upr+1 );
  3816     assert( pPage->isInit );
  3817     if( pPage->leaf ){
  3818       chldPg = 0;
  3819     }else if( lwr>=pPage->nCell ){
  3820       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  3821     }else{
  3822       chldPg = get4byte(findCell(pPage, lwr));
  3823     }
  3824     if( chldPg==0 ){
  3825       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3826       if( pRes ) *pRes = c;
  3827       rc = SQLITE_OK;
  3828       goto moveto_finish;
  3829     }
  3830     pCur->aiIdx[pCur->iPage] = lwr;
  3831     pCur->info.nSize = 0;
  3832     pCur->validNKey = 0;
  3833     rc = moveToChild(pCur, chldPg);
  3834     if( rc ) goto moveto_finish;
  3835   }
  3836 moveto_finish:
  3837   return rc;
  3838 }
  3839 
  3840 /*
  3841 ** In this version of BtreeMoveto, pKey is a packed index record
  3842 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
  3843 ** record and then call BtreeMovetoUnpacked() to do the work.
  3844 */
  3845 int sqlite3BtreeMoveto(
  3846   BtCursor *pCur,     /* Cursor open on the btree to be searched */
  3847   const void *pKey,   /* Packed key if the btree is an index */
  3848   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
  3849   int bias,           /* Bias search to the high end */
  3850   int *pRes           /* Write search results here */
  3851 ){
  3852   int rc;                    /* Status code */
  3853   UnpackedRecord *pIdxKey;   /* Unpacked index key */
  3854   UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
  3855 
  3856   if( pKey ){
  3857     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
  3858                                       aSpace, sizeof(aSpace));
  3859     if( pIdxKey==0 ) return SQLITE_NOMEM;
  3860   }else{
  3861     pIdxKey = 0;
  3862   }
  3863   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
  3864   if( pKey ){
  3865     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
  3866   }
  3867   return rc;
  3868 }
  3869 
  3870 
  3871 /*
  3872 ** Return TRUE if the cursor is not pointing at an entry of the table.
  3873 **
  3874 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
  3875 ** past the last entry in the table or sqlite3BtreePrev() moves past
  3876 ** the first entry.  TRUE is also returned if the table is empty.
  3877 */
  3878 int sqlite3BtreeEof(BtCursor *pCur){
  3879   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
  3880   ** have been deleted? This API will need to change to return an error code
  3881   ** as well as the boolean result value.
  3882   */
  3883   return (CURSOR_VALID!=pCur->eState);
  3884 }
  3885 
  3886 /*
  3887 ** Return the database connection handle for a cursor.
  3888 */
  3889 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
  3890   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3891   return pCur->pBtree->db;
  3892 }
  3893 
  3894 /*
  3895 ** Advance the cursor to the next entry in the database.  If
  3896 ** successful then set *pRes=0.  If the cursor
  3897 ** was already pointing to the last entry in the database before
  3898 ** this routine was called, then set *pRes=1.
  3899 */
  3900 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
  3901   int rc;
  3902   int idx;
  3903   MemPage *pPage;
  3904 
  3905   assert( cursorHoldsMutex(pCur) );
  3906   rc = restoreCursorPosition(pCur);
  3907   if( rc!=SQLITE_OK ){
  3908     return rc;
  3909   }
  3910   assert( pRes!=0 );
  3911   if( CURSOR_INVALID==pCur->eState ){
  3912     *pRes = 1;
  3913     return SQLITE_OK;
  3914   }
  3915   if( pCur->skip>0 ){
  3916     pCur->skip = 0;
  3917     *pRes = 0;
  3918     return SQLITE_OK;
  3919   }
  3920   pCur->skip = 0;
  3921 
  3922   pPage = pCur->apPage[pCur->iPage];
  3923   idx = ++pCur->aiIdx[pCur->iPage];
  3924   assert( pPage->isInit );
  3925   assert( idx<=pPage->nCell );
  3926 
  3927   pCur->info.nSize = 0;
  3928   pCur->validNKey = 0;
  3929   if( idx>=pPage->nCell ){
  3930     if( !pPage->leaf ){
  3931       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  3932       if( rc ) return rc;
  3933       rc = moveToLeftmost(pCur);
  3934       *pRes = 0;
  3935       return rc;
  3936     }
  3937     do{
  3938       if( pCur->iPage==0 ){
  3939         *pRes = 1;
  3940         pCur->eState = CURSOR_INVALID;
  3941         return SQLITE_OK;
  3942       }
  3943       sqlite3BtreeMoveToParent(pCur);
  3944       pPage = pCur->apPage[pCur->iPage];
  3945     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
  3946     *pRes = 0;
  3947     if( pPage->intKey ){
  3948       rc = sqlite3BtreeNext(pCur, pRes);
  3949     }else{
  3950       rc = SQLITE_OK;
  3951     }
  3952     return rc;
  3953   }
  3954   *pRes = 0;
  3955   if( pPage->leaf ){
  3956     return SQLITE_OK;
  3957   }
  3958   rc = moveToLeftmost(pCur);
  3959   return rc;
  3960 }
  3961 
  3962 
  3963 /*
  3964 ** Step the cursor to the back to the previous entry in the database.  If
  3965 ** successful then set *pRes=0.  If the cursor
  3966 ** was already pointing to the first entry in the database before
  3967 ** this routine was called, then set *pRes=1.
  3968 */
  3969 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
  3970   int rc;
  3971   MemPage *pPage;
  3972 
  3973   assert( cursorHoldsMutex(pCur) );
  3974   rc = restoreCursorPosition(pCur);
  3975   if( rc!=SQLITE_OK ){
  3976     return rc;
  3977   }
  3978   pCur->atLast = 0;
  3979   if( CURSOR_INVALID==pCur->eState ){
  3980     *pRes = 1;
  3981     return SQLITE_OK;
  3982   }
  3983   if( pCur->skip<0 ){
  3984     pCur->skip = 0;
  3985     *pRes = 0;
  3986     return SQLITE_OK;
  3987   }
  3988   pCur->skip = 0;
  3989 
  3990   pPage = pCur->apPage[pCur->iPage];
  3991   assert( pPage->isInit );
  3992   if( !pPage->leaf ){
  3993     int idx = pCur->aiIdx[pCur->iPage];
  3994     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
  3995     if( rc ){
  3996       return rc;
  3997     }
  3998     rc = moveToRightmost(pCur);
  3999   }else{
  4000     while( pCur->aiIdx[pCur->iPage]==0 ){
  4001       if( pCur->iPage==0 ){
  4002         pCur->eState = CURSOR_INVALID;
  4003         *pRes = 1;
  4004         return SQLITE_OK;
  4005       }
  4006       sqlite3BtreeMoveToParent(pCur);
  4007     }
  4008     pCur->info.nSize = 0;
  4009     pCur->validNKey = 0;
  4010 
  4011     pCur->aiIdx[pCur->iPage]--;
  4012     pPage = pCur->apPage[pCur->iPage];
  4013     if( pPage->intKey && !pPage->leaf ){
  4014       rc = sqlite3BtreePrevious(pCur, pRes);
  4015     }else{
  4016       rc = SQLITE_OK;
  4017     }
  4018   }
  4019   *pRes = 0;
  4020   return rc;
  4021 }
  4022 
  4023 /*
  4024 ** Allocate a new page from the database file.
  4025 **
  4026 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
  4027 ** has already been called on the new page.)  The new page has also
  4028 ** been referenced and the calling routine is responsible for calling
  4029 ** sqlite3PagerUnref() on the new page when it is done.
  4030 **
  4031 ** SQLITE_OK is returned on success.  Any other return value indicates
  4032 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
  4033 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
  4034 **
  4035 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 
  4036 ** locate a page close to the page number "nearby".  This can be used in an
  4037 ** attempt to keep related pages close to each other in the database file,
  4038 ** which in turn can make database access faster.
  4039 **
  4040 ** If the "exact" parameter is not 0, and the page-number nearby exists 
  4041 ** anywhere on the free-list, then it is guarenteed to be returned. This
  4042 ** is only used by auto-vacuum databases when allocating a new table.
  4043 */
  4044 static int allocateBtreePage(
  4045   BtShared *pBt, 
  4046   MemPage **ppPage, 
  4047   Pgno *pPgno, 
  4048   Pgno nearby,
  4049   u8 exact
  4050 ){
  4051   MemPage *pPage1;
  4052   int rc;
  4053   int n;     /* Number of pages on the freelist */
  4054   int k;     /* Number of leaves on the trunk of the freelist */
  4055   MemPage *pTrunk = 0;
  4056   MemPage *pPrevTrunk = 0;
  4057 
  4058   assert( sqlite3_mutex_held(pBt->mutex) );
  4059   pPage1 = pBt->pPage1;
  4060   n = get4byte(&pPage1->aData[36]);
  4061   if( n>0 ){
  4062     /* There are pages on the freelist.  Reuse one of those pages. */
  4063     Pgno iTrunk;
  4064     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
  4065     
  4066     /* If the 'exact' parameter was true and a query of the pointer-map
  4067     ** shows that the page 'nearby' is somewhere on the free-list, then
  4068     ** the entire-list will be searched for that page.
  4069     */
  4070 #ifndef SQLITE_OMIT_AUTOVACUUM
  4071     if( exact && nearby<=pagerPagecount(pBt->pPager) ){
  4072       u8 eType;
  4073       assert( nearby>0 );
  4074       assert( pBt->autoVacuum );
  4075       rc = ptrmapGet(pBt, nearby, &eType, 0);
  4076       if( rc ) return rc;
  4077       if( eType==PTRMAP_FREEPAGE ){
  4078         searchList = 1;
  4079       }
  4080       *pPgno = nearby;
  4081     }
  4082 #endif
  4083 
  4084     /* Decrement the free-list count by 1. Set iTrunk to the index of the
  4085     ** first free-list trunk page. iPrevTrunk is initially 1.
  4086     */
  4087     rc = sqlite3PagerWrite(pPage1->pDbPage);
  4088     if( rc ) return rc;
  4089     put4byte(&pPage1->aData[36], n-1);
  4090 
  4091     /* The code within this loop is run only once if the 'searchList' variable
  4092     ** is not true. Otherwise, it runs once for each trunk-page on the
  4093     ** free-list until the page 'nearby' is located.
  4094     */
  4095     do {
  4096       pPrevTrunk = pTrunk;
  4097       if( pPrevTrunk ){
  4098         iTrunk = get4byte(&pPrevTrunk->aData[0]);
  4099       }else{
  4100         iTrunk = get4byte(&pPage1->aData[32]);
  4101       }
  4102       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
  4103       if( rc ){
  4104         pTrunk = 0;
  4105         goto end_allocate_page;
  4106       }
  4107 
  4108       k = get4byte(&pTrunk->aData[4]);
  4109       if( k==0 && !searchList ){
  4110         /* The trunk has no leaves and the list is not being searched. 
  4111         ** So extract the trunk page itself and use it as the newly 
  4112         ** allocated page */
  4113         assert( pPrevTrunk==0 );
  4114         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4115         if( rc ){
  4116           goto end_allocate_page;
  4117         }
  4118         *pPgno = iTrunk;
  4119         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4120         *ppPage = pTrunk;
  4121         pTrunk = 0;
  4122         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4123       }else if( k>pBt->usableSize/4 - 2 ){
  4124         /* Value of k is out of range.  Database corruption */
  4125         rc = SQLITE_CORRUPT_BKPT;
  4126         goto end_allocate_page;
  4127 #ifndef SQLITE_OMIT_AUTOVACUUM
  4128       }else if( searchList && nearby==iTrunk ){
  4129         /* The list is being searched and this trunk page is the page
  4130         ** to allocate, regardless of whether it has leaves.
  4131         */
  4132         assert( *pPgno==iTrunk );
  4133         *ppPage = pTrunk;
  4134         searchList = 0;
  4135         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4136         if( rc ){
  4137           goto end_allocate_page;
  4138         }
  4139         if( k==0 ){
  4140           if( !pPrevTrunk ){
  4141             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4142           }else{
  4143             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
  4144           }
  4145         }else{
  4146           /* The trunk page is required by the caller but it contains 
  4147           ** pointers to free-list leaves. The first leaf becomes a trunk
  4148           ** page in this case.
  4149           */
  4150           MemPage *pNewTrunk;
  4151           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
  4152           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
  4153           if( rc!=SQLITE_OK ){
  4154             goto end_allocate_page;
  4155           }
  4156           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
  4157           if( rc!=SQLITE_OK ){
  4158             releasePage(pNewTrunk);
  4159             goto end_allocate_page;
  4160           }
  4161           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
  4162           put4byte(&pNewTrunk->aData[4], k-1);
  4163           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
  4164           releasePage(pNewTrunk);
  4165           if( !pPrevTrunk ){
  4166             put4byte(&pPage1->aData[32], iNewTrunk);
  4167           }else{
  4168             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  4169             if( rc ){
  4170               goto end_allocate_page;
  4171             }
  4172             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
  4173           }
  4174         }
  4175         pTrunk = 0;
  4176         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4177 #endif
  4178       }else{
  4179         /* Extract a leaf from the trunk */
  4180         int closest;
  4181         Pgno iPage;
  4182         unsigned char *aData = pTrunk->aData;
  4183         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4184         if( rc ){
  4185           goto end_allocate_page;
  4186         }
  4187         if( nearby>0 ){
  4188           int i, dist;
  4189           closest = 0;
  4190           dist = get4byte(&aData[8]) - nearby;
  4191           if( dist<0 ) dist = -dist;
  4192           for(i=1; i<k; i++){
  4193             int d2 = get4byte(&aData[8+i*4]) - nearby;
  4194             if( d2<0 ) d2 = -d2;
  4195             if( d2<dist ){
  4196               closest = i;
  4197               dist = d2;
  4198             }
  4199           }
  4200         }else{
  4201           closest = 0;
  4202         }
  4203 
  4204         iPage = get4byte(&aData[8+closest*4]);
  4205         if( !searchList || iPage==nearby ){
  4206           int nPage;
  4207           *pPgno = iPage;
  4208           nPage = pagerPagecount(pBt->pPager);
  4209           if( *pPgno>nPage ){
  4210             /* Free page off the end of the file */
  4211             rc = SQLITE_CORRUPT_BKPT;
  4212             goto end_allocate_page;
  4213           }
  4214           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
  4215                  ": %d more free pages\n",
  4216                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
  4217           if( closest<k-1 ){
  4218             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
  4219           }
  4220           put4byte(&aData[4], k-1);
  4221           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
  4222           if( rc==SQLITE_OK ){
  4223             sqlite3PagerDontRollback((*ppPage)->pDbPage);
  4224             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4225             if( rc!=SQLITE_OK ){
  4226               releasePage(*ppPage);
  4227             }
  4228           }
  4229           searchList = 0;
  4230         }
  4231       }
  4232       releasePage(pPrevTrunk);
  4233       pPrevTrunk = 0;
  4234     }while( searchList );
  4235   }else{
  4236     /* There are no pages on the freelist, so create a new page at the
  4237     ** end of the file */
  4238     int nPage = pagerPagecount(pBt->pPager);
  4239     *pPgno = nPage + 1;
  4240 
  4241 #ifndef SQLITE_OMIT_AUTOVACUUM
  4242     if( pBt->nTrunc ){
  4243       /* An incr-vacuum has already run within this transaction. So the
  4244       ** page to allocate is not from the physical end of the file, but
  4245       ** at pBt->nTrunc. 
  4246       */
  4247       *pPgno = pBt->nTrunc+1;
  4248       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
  4249         (*pPgno)++;
  4250       }
  4251     }
  4252     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
  4253       /* If *pPgno refers to a pointer-map page, allocate two new pages
  4254       ** at the end of the file instead of one. The first allocated page
  4255       ** becomes a new pointer-map page, the second is used by the caller.
  4256       */
  4257       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
  4258       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4259       (*pPgno)++;
  4260       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
  4261     }
  4262     if( pBt->nTrunc ){
  4263       pBt->nTrunc = *pPgno;
  4264     }
  4265 #endif
  4266 
  4267     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4268     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
  4269     if( rc ) return rc;
  4270     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4271     if( rc!=SQLITE_OK ){
  4272       releasePage(*ppPage);
  4273     }
  4274     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
  4275   }
  4276 
  4277   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4278 
  4279 end_allocate_page:
  4280   releasePage(pTrunk);
  4281   releasePage(pPrevTrunk);
  4282   if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
  4283     releasePage(*ppPage);
  4284     return SQLITE_CORRUPT_BKPT;
  4285   }
  4286   return rc;
  4287 }
  4288 
  4289 /*
  4290 ** Add a page of the database file to the freelist.
  4291 **
  4292 ** sqlite3PagerUnref() is NOT called for pPage.
  4293 */
  4294 static int freePage(MemPage *pPage){
  4295   BtShared *pBt = pPage->pBt;
  4296   MemPage *pPage1 = pBt->pPage1;
  4297   int rc, n, k;
  4298 
  4299   /* Prepare the page for freeing */
  4300   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4301   assert( pPage->pgno>1 );
  4302   pPage->isInit = 0;
  4303 
  4304   /* Increment the free page count on pPage1 */
  4305   rc = sqlite3PagerWrite(pPage1->pDbPage);
  4306   if( rc ) return rc;
  4307   n = get4byte(&pPage1->aData[36]);
  4308   put4byte(&pPage1->aData[36], n+1);
  4309 
  4310 #ifdef SQLITE_SECURE_DELETE
  4311   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
  4312   ** always fully overwrite deleted information with zeros.
  4313   */
  4314   rc = sqlite3PagerWrite(pPage->pDbPage);
  4315   if( rc ) return rc;
  4316   memset(pPage->aData, 0, pPage->pBt->pageSize);
  4317 #endif
  4318 
  4319   /* If the database supports auto-vacuum, write an entry in the pointer-map
  4320   ** to indicate that the page is free.
  4321   */
  4322   if( ISAUTOVACUUM ){
  4323     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
  4324     if( rc ) return rc;
  4325   }
  4326 
  4327   if( n==0 ){
  4328     /* This is the first free page */
  4329     rc = sqlite3PagerWrite(pPage->pDbPage);
  4330     if( rc ) return rc;
  4331     memset(pPage->aData, 0, 8);
  4332     put4byte(&pPage1->aData[32], pPage->pgno);
  4333     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
  4334   }else{
  4335     /* Other free pages already exist.  Retrive the first trunk page
  4336     ** of the freelist and find out how many leaves it has. */
  4337     MemPage *pTrunk;
  4338     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
  4339     if( rc ) return rc;
  4340     k = get4byte(&pTrunk->aData[4]);
  4341     if( k>=pBt->usableSize/4 - 8 ){
  4342       /* The trunk is full.  Turn the page being freed into a new
  4343       ** trunk page with no leaves.
  4344       **
  4345       ** Note that the trunk page is not really full until it contains
  4346       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
  4347       ** coded.  But due to a coding error in versions of SQLite prior to
  4348       ** 3.6.0, databases with freelist trunk pages holding more than
  4349       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
  4350       ** to maintain backwards compatibility with older versions of SQLite,
  4351       ** we will contain to restrict the number of entries to usableSize/4 - 8
  4352       ** for now.  At some point in the future (once everyone has upgraded
  4353       ** to 3.6.0 or later) we should consider fixing the conditional above
  4354       ** to read "usableSize/4-2" instead of "usableSize/4-8".
  4355       */
  4356       rc = sqlite3PagerWrite(pPage->pDbPage);
  4357       if( rc==SQLITE_OK ){
  4358         put4byte(pPage->aData, pTrunk->pgno);
  4359         put4byte(&pPage->aData[4], 0);
  4360         put4byte(&pPage1->aData[32], pPage->pgno);
  4361         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
  4362                 pPage->pgno, pTrunk->pgno));
  4363       }
  4364     }else if( k<0 ){
  4365       rc = SQLITE_CORRUPT;
  4366     }else{
  4367       /* Add the newly freed page as a leaf on the current trunk */
  4368       rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4369       if( rc==SQLITE_OK ){
  4370         put4byte(&pTrunk->aData[4], k+1);
  4371         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
  4372 #ifndef SQLITE_SECURE_DELETE
  4373         rc = sqlite3PagerDontWrite(pPage->pDbPage);
  4374 #endif
  4375       }
  4376       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
  4377     }
  4378     releasePage(pTrunk);
  4379   }
  4380   return rc;
  4381 }
  4382 
  4383 /*
  4384 ** Free any overflow pages associated with the given Cell.
  4385 */
  4386 static int clearCell(MemPage *pPage, unsigned char *pCell){
  4387   BtShared *pBt = pPage->pBt;
  4388   CellInfo info;
  4389   Pgno ovflPgno;
  4390   int rc;
  4391   int nOvfl;
  4392   int ovflPageSize;
  4393 
  4394   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4395   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4396   if( info.iOverflow==0 ){
  4397     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
  4398   }
  4399   ovflPgno = get4byte(&pCell[info.iOverflow]);
  4400   ovflPageSize = pBt->usableSize - 4;
  4401   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
  4402   assert( ovflPgno==0 || nOvfl>0 );
  4403   while( nOvfl-- ){
  4404     MemPage *pOvfl;
  4405     if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
  4406       return SQLITE_CORRUPT_BKPT;
  4407     }
  4408 
  4409     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
  4410     if( rc ) return rc;
  4411     rc = freePage(pOvfl);
  4412     sqlite3PagerUnref(pOvfl->pDbPage);
  4413     if( rc ) return rc;
  4414   }
  4415   return SQLITE_OK;
  4416 }
  4417 
  4418 /*
  4419 ** Create the byte sequence used to represent a cell on page pPage
  4420 ** and write that byte sequence into pCell[].  Overflow pages are
  4421 ** allocated and filled in as necessary.  The calling procedure
  4422 ** is responsible for making sure sufficient space has been allocated
  4423 ** for pCell[].
  4424 **
  4425 ** Note that pCell does not necessary need to point to the pPage->aData
  4426 ** area.  pCell might point to some temporary storage.  The cell will
  4427 ** be constructed in this temporary area then copied into pPage->aData
  4428 ** later.
  4429 */
  4430 static int fillInCell(
  4431   MemPage *pPage,                /* The page that contains the cell */
  4432   unsigned char *pCell,          /* Complete text of the cell */
  4433   const void *pKey, i64 nKey,    /* The key */
  4434   const void *pData,int nData,   /* The data */
  4435   int nZero,                     /* Extra zero bytes to append to pData */
  4436   int *pnSize                    /* Write cell size here */
  4437 ){
  4438   int nPayload;
  4439   const u8 *pSrc;
  4440   int nSrc, n, rc;
  4441   int spaceLeft;
  4442   MemPage *pOvfl = 0;
  4443   MemPage *pToRelease = 0;
  4444   unsigned char *pPrior;
  4445   unsigned char *pPayload;
  4446   BtShared *pBt = pPage->pBt;
  4447   Pgno pgnoOvfl = 0;
  4448   int nHeader;
  4449   CellInfo info;
  4450 
  4451   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4452 
  4453   /* Fill in the header. */
  4454   nHeader = 0;
  4455   if( !pPage->leaf ){
  4456     nHeader += 4;
  4457   }
  4458   if( pPage->hasData ){
  4459     nHeader += putVarint(&pCell[nHeader], nData+nZero);
  4460   }else{
  4461     nData = nZero = 0;
  4462   }
  4463   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
  4464   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4465   assert( info.nHeader==nHeader );
  4466   assert( info.nKey==nKey );
  4467   assert( info.nData==nData+nZero );
  4468   
  4469   /* Fill in the payload */
  4470   nPayload = nData + nZero;
  4471   if( pPage->intKey ){
  4472     pSrc = pData;
  4473     nSrc = nData;
  4474     nData = 0;
  4475   }else{
  4476     nPayload += nKey;
  4477     pSrc = pKey;
  4478     nSrc = nKey;
  4479   }
  4480   *pnSize = info.nSize;
  4481   spaceLeft = info.nLocal;
  4482   pPayload = &pCell[nHeader];
  4483   pPrior = &pCell[info.iOverflow];
  4484 
  4485   while( nPayload>0 ){
  4486     if( spaceLeft==0 ){
  4487       int isExact = 0;
  4488 #ifndef SQLITE_OMIT_AUTOVACUUM
  4489       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
  4490       if( pBt->autoVacuum ){
  4491         do{
  4492           pgnoOvfl++;
  4493         } while( 
  4494           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
  4495         );
  4496         if( pgnoOvfl>1 ){
  4497           /* isExact = 1; */
  4498         }
  4499       }
  4500 #endif
  4501       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
  4502 #ifndef SQLITE_OMIT_AUTOVACUUM
  4503       /* If the database supports auto-vacuum, and the second or subsequent
  4504       ** overflow page is being allocated, add an entry to the pointer-map
  4505       ** for that page now. 
  4506       **
  4507       ** If this is the first overflow page, then write a partial entry 
  4508       ** to the pointer-map. If we write nothing to this pointer-map slot,
  4509       ** then the optimistic overflow chain processing in clearCell()
  4510       ** may misinterpret the uninitialised values and delete the
  4511       ** wrong pages from the database.
  4512       */
  4513       if( pBt->autoVacuum && rc==SQLITE_OK ){
  4514         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
  4515         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
  4516         if( rc ){
  4517           releasePage(pOvfl);
  4518         }
  4519       }
  4520 #endif
  4521       if( rc ){
  4522         releasePage(pToRelease);
  4523         return rc;
  4524       }
  4525       put4byte(pPrior, pgnoOvfl);
  4526       releasePage(pToRelease);
  4527       pToRelease = pOvfl;
  4528       pPrior = pOvfl->aData;
  4529       put4byte(pPrior, 0);
  4530       pPayload = &pOvfl->aData[4];
  4531       spaceLeft = pBt->usableSize - 4;
  4532     }
  4533     n = nPayload;
  4534     if( n>spaceLeft ) n = spaceLeft;
  4535     if( nSrc>0 ){
  4536       if( n>nSrc ) n = nSrc;
  4537       assert( pSrc );
  4538       memcpy(pPayload, pSrc, n);
  4539     }else{
  4540       memset(pPayload, 0, n);
  4541     }
  4542     nPayload -= n;
  4543     pPayload += n;
  4544     pSrc += n;
  4545     nSrc -= n;
  4546     spaceLeft -= n;
  4547     if( nSrc==0 ){
  4548       nSrc = nData;
  4549       pSrc = pData;
  4550     }
  4551   }
  4552   releasePage(pToRelease);
  4553   return SQLITE_OK;
  4554 }
  4555 
  4556 /*
  4557 ** Remove the i-th cell from pPage.  This routine effects pPage only.
  4558 ** The cell content is not freed or deallocated.  It is assumed that
  4559 ** the cell content has been copied someplace else.  This routine just
  4560 ** removes the reference to the cell from pPage.
  4561 **
  4562 ** "sz" must be the number of bytes in the cell.
  4563 */
  4564 static void dropCell(MemPage *pPage, int idx, int sz){
  4565   int i;          /* Loop counter */
  4566   int pc;         /* Offset to cell content of cell being deleted */
  4567   u8 *data;       /* pPage->aData */
  4568   u8 *ptr;        /* Used to move bytes around within data[] */
  4569 
  4570   assert( idx>=0 && idx<pPage->nCell );
  4571   assert( sz==cellSize(pPage, idx) );
  4572   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  4573   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4574   data = pPage->aData;
  4575   ptr = &data[pPage->cellOffset + 2*idx];
  4576   pc = get2byte(ptr);
  4577   assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
  4578   freeSpace(pPage, pc, sz);
  4579   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
  4580     ptr[0] = ptr[2];
  4581     ptr[1] = ptr[3];
  4582   }
  4583   pPage->nCell--;
  4584   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
  4585   pPage->nFree += 2;
  4586 }
  4587 
  4588 /*
  4589 ** Insert a new cell on pPage at cell index "i".  pCell points to the
  4590 ** content of the cell.
  4591 **
  4592 ** If the cell content will fit on the page, then put it there.  If it
  4593 ** will not fit, then make a copy of the cell content into pTemp if
  4594 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
  4595 ** in pPage->aOvfl[] and make it point to the cell content (either
  4596 ** in pTemp or the original pCell) and also record its index. 
  4597 ** Allocating a new entry in pPage->aCell[] implies that 
  4598 ** pPage->nOverflow is incremented.
  4599 **
  4600 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
  4601 ** cell. The caller will overwrite them after this function returns. If
  4602 ** nSkip is non-zero, then pCell may not point to an invalid memory location 
  4603 ** (but pCell+nSkip is always valid).
  4604 */
  4605 static int insertCell(
  4606   MemPage *pPage,   /* Page into which we are copying */
  4607   int i,            /* New cell becomes the i-th cell of the page */
  4608   u8 *pCell,        /* Content of the new cell */
  4609   int sz,           /* Bytes of content in pCell */
  4610   u8 *pTemp,        /* Temp storage space for pCell, if needed */
  4611   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
  4612 ){
  4613   int idx;          /* Where to write new cell content in data[] */
  4614   int j;            /* Loop counter */
  4615   int top;          /* First byte of content for any cell in data[] */
  4616   int end;          /* First byte past the last cell pointer in data[] */
  4617   int ins;          /* Index in data[] where new cell pointer is inserted */
  4618   int hdr;          /* Offset into data[] of the page header */
  4619   int cellOffset;   /* Address of first cell pointer in data[] */
  4620   u8 *data;         /* The content of the whole page */
  4621   u8 *ptr;          /* Used for moving information around in data[] */
  4622 
  4623   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  4624   assert( sz==cellSizePtr(pPage, pCell) );
  4625   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4626   if( pPage->nOverflow || sz+2>pPage->nFree ){
  4627     if( pTemp ){
  4628       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
  4629       pCell = pTemp;
  4630     }
  4631     j = pPage->nOverflow++;
  4632     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
  4633     pPage->aOvfl[j].pCell = pCell;
  4634     pPage->aOvfl[j].idx = i;
  4635     pPage->nFree = 0;
  4636   }else{
  4637     int rc = sqlite3PagerWrite(pPage->pDbPage);
  4638     if( rc!=SQLITE_OK ){
  4639       return rc;
  4640     }
  4641     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  4642     data = pPage->aData;
  4643     hdr = pPage->hdrOffset;
  4644     top = get2byte(&data[hdr+5]);
  4645     cellOffset = pPage->cellOffset;
  4646     end = cellOffset + 2*pPage->nCell + 2;
  4647     ins = cellOffset + 2*i;
  4648     if( end > top - sz ){
  4649       defragmentPage(pPage);
  4650       top = get2byte(&data[hdr+5]);
  4651       assert( end + sz <= top );
  4652     }
  4653     idx = allocateSpace(pPage, sz);
  4654     assert( idx>0 );
  4655     assert( end <= get2byte(&data[hdr+5]) );
  4656     pPage->nCell++;
  4657     pPage->nFree -= 2;
  4658     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
  4659     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
  4660       ptr[0] = ptr[-2];
  4661       ptr[1] = ptr[-1];
  4662     }
  4663     put2byte(&data[ins], idx);
  4664     put2byte(&data[hdr+3], pPage->nCell);
  4665 #ifndef SQLITE_OMIT_AUTOVACUUM
  4666     if( pPage->pBt->autoVacuum ){
  4667       /* The cell may contain a pointer to an overflow page. If so, write
  4668       ** the entry for the overflow page into the pointer map.
  4669       */
  4670       CellInfo info;
  4671       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4672       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
  4673       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
  4674         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
  4675         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
  4676         if( rc!=SQLITE_OK ) return rc;
  4677       }
  4678     }
  4679 #endif
  4680   }
  4681 
  4682   return SQLITE_OK;
  4683 }
  4684 
  4685 /*
  4686 ** Add a list of cells to a page.  The page should be initially empty.
  4687 ** The cells are guaranteed to fit on the page.
  4688 */
  4689 static void assemblePage(
  4690   MemPage *pPage,   /* The page to be assemblied */
  4691   int nCell,        /* The number of cells to add to this page */
  4692   u8 **apCell,      /* Pointers to cell bodies */
  4693   u16 *aSize        /* Sizes of the cells */
  4694 ){
  4695   int i;            /* Loop counter */
  4696   int totalSize;    /* Total size of all cells */
  4697   int hdr;          /* Index of page header */
  4698   int cellptr;      /* Address of next cell pointer */
  4699   int cellbody;     /* Address of next cell body */
  4700   u8 *data;         /* Data for the page */
  4701 
  4702   assert( pPage->nOverflow==0 );
  4703   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4704   totalSize = 0;
  4705   for(i=0; i<nCell; i++){
  4706     totalSize += aSize[i];
  4707   }
  4708   assert( totalSize+2*nCell<=pPage->nFree );
  4709   assert( pPage->nCell==0 );
  4710   cellptr = pPage->cellOffset;
  4711   data = pPage->aData;
  4712   hdr = pPage->hdrOffset;
  4713   put2byte(&data[hdr+3], nCell);
  4714   if( nCell ){
  4715     cellbody = allocateSpace(pPage, totalSize);
  4716     assert( cellbody>0 );
  4717     assert( pPage->nFree >= 2*nCell );
  4718     pPage->nFree -= 2*nCell;
  4719     for(i=0; i<nCell; i++){
  4720       put2byte(&data[cellptr], cellbody);
  4721       memcpy(&data[cellbody], apCell[i], aSize[i]);
  4722       cellptr += 2;
  4723       cellbody += aSize[i];
  4724     }
  4725     assert( cellbody==pPage->pBt->usableSize );
  4726   }
  4727   pPage->nCell = nCell;
  4728 }
  4729 
  4730 /*
  4731 ** The following parameters determine how many adjacent pages get involved
  4732 ** in a balancing operation.  NN is the number of neighbors on either side
  4733 ** of the page that participate in the balancing operation.  NB is the
  4734 ** total number of pages that participate, including the target page and
  4735 ** NN neighbors on either side.
  4736 **
  4737 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
  4738 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
  4739 ** in exchange for a larger degradation in INSERT and UPDATE performance.
  4740 ** The value of NN appears to give the best results overall.
  4741 */
  4742 #define NN 1             /* Number of neighbors on either side of pPage */
  4743 #define NB (NN*2+1)      /* Total pages involved in the balance */
  4744 
  4745 /* Forward reference */
  4746 static int balance(BtCursor*, int);
  4747 
  4748 #ifndef SQLITE_OMIT_QUICKBALANCE
  4749 /*
  4750 ** This version of balance() handles the common special case where
  4751 ** a new entry is being inserted on the extreme right-end of the
  4752 ** tree, in other words, when the new entry will become the largest
  4753 ** entry in the tree.
  4754 **
  4755 ** Instead of trying balance the 3 right-most leaf pages, just add
  4756 ** a new page to the right-hand side and put the one new entry in
  4757 ** that page.  This leaves the right side of the tree somewhat
  4758 ** unbalanced.  But odds are that we will be inserting new entries
  4759 ** at the end soon afterwards so the nearly empty page will quickly
  4760 ** fill up.  On average.
  4761 **
  4762 ** pPage is the leaf page which is the right-most page in the tree.
  4763 ** pParent is its parent.  pPage must have a single overflow entry
  4764 ** which is also the right-most entry on the page.
  4765 */
  4766 static int balance_quick(BtCursor *pCur){
  4767   int rc;
  4768   MemPage *pNew = 0;
  4769   Pgno pgnoNew;
  4770   u8 *pCell;
  4771   u16 szCell;
  4772   CellInfo info;
  4773   MemPage *pPage = pCur->apPage[pCur->iPage];
  4774   MemPage *pParent = pCur->apPage[pCur->iPage-1];
  4775   BtShared *pBt = pPage->pBt;
  4776   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
  4777   int parentSize;                   /* Size of new divider cell */
  4778   u8 parentCell[64];                /* Space for the new divider cell */
  4779 
  4780   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4781 
  4782   /* Allocate a new page. Insert the overflow cell from pPage
  4783   ** into it. Then remove the overflow cell from pPage.
  4784   */
  4785   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
  4786   if( rc==SQLITE_OK ){
  4787     pCell = pPage->aOvfl[0].pCell;
  4788     szCell = cellSizePtr(pPage, pCell);
  4789     zeroPage(pNew, pPage->aData[0]);
  4790     assemblePage(pNew, 1, &pCell, &szCell);
  4791     pPage->nOverflow = 0;
  4792   
  4793     /* pPage is currently the right-child of pParent. Change this
  4794     ** so that the right-child is the new page allocated above and
  4795     ** pPage is the next-to-right child. 
  4796     **
  4797     ** Ignore the return value of the call to fillInCell(). fillInCell()
  4798     ** may only return other than SQLITE_OK if it is required to allocate
  4799     ** one or more overflow pages. Since an internal table B-Tree cell 
  4800     ** may never spill over onto an overflow page (it is a maximum of 
  4801     ** 13 bytes in size), it is not neccessary to check the return code.
  4802     **
  4803     ** Similarly, the insertCell() function cannot fail if the page
  4804     ** being inserted into is already writable and the cell does not 
  4805     ** contain an overflow pointer. So ignore this return code too.
  4806     */
  4807     assert( pPage->nCell>0 );
  4808     pCell = findCell(pPage, pPage->nCell-1);
  4809     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4810     fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
  4811     assert( parentSize<64 );
  4812     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  4813     insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
  4814     put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
  4815     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  4816   
  4817     /* If this is an auto-vacuum database, update the pointer map
  4818     ** with entries for the new page, and any pointer from the 
  4819     ** cell on the page to an overflow page.
  4820     */
  4821     if( ISAUTOVACUUM ){
  4822       rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
  4823       if( rc==SQLITE_OK ){
  4824         rc = ptrmapPutOvfl(pNew, 0);
  4825       }
  4826     }
  4827 
  4828     /* Release the reference to the new page. */
  4829     releasePage(pNew);
  4830   }
  4831 
  4832   /* At this point the pPage->nFree variable is not set correctly with
  4833   ** respect to the content of the page (because it was set to 0 by 
  4834   ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
  4835   ** correct.
  4836   **
  4837   ** This has to be done even if an error will be returned. Normally, if
  4838   ** an error occurs during tree balancing, the contents of MemPage are
  4839   ** not important, as they will be recalculated when the page is rolled
  4840   ** back. But here, in balance_quick(), it is possible that pPage has 
  4841   ** not yet been marked dirty or written into the journal file. Therefore
  4842   ** it will not be rolled back and so it is important to make sure that
  4843   ** the page data and contents of MemPage are consistent.
  4844   */
  4845   pPage->isInit = 0;
  4846   sqlite3BtreeInitPage(pPage);
  4847 
  4848   /* If everything else succeeded, balance the parent page, in 
  4849   ** case the divider cell inserted caused it to become overfull.
  4850   */
  4851   if( rc==SQLITE_OK ){
  4852     releasePage(pPage);
  4853     pCur->iPage--;
  4854     rc = balance(pCur, 0);
  4855   }
  4856   return rc;
  4857 }
  4858 #endif /* SQLITE_OMIT_QUICKBALANCE */
  4859 
  4860 /*
  4861 ** This routine redistributes Cells on pPage and up to NN*2 siblings
  4862 ** of pPage so that all pages have about the same amount of free space.
  4863 ** Usually NN siblings on either side of pPage is used in the balancing,
  4864 ** though more siblings might come from one side if pPage is the first
  4865 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
  4866 ** (something which can only happen if pPage is the root page or a 
  4867 ** child of root) then all available siblings participate in the balancing.
  4868 **
  4869 ** The number of siblings of pPage might be increased or decreased by one or
  4870 ** two in an effort to keep pages nearly full but not over full. The root page
  4871 ** is special and is allowed to be nearly empty. If pPage is 
  4872 ** the root page, then the depth of the tree might be increased
  4873 ** or decreased by one, as necessary, to keep the root page from being
  4874 ** overfull or completely empty.
  4875 **
  4876 ** Note that when this routine is called, some of the Cells on pPage
  4877 ** might not actually be stored in pPage->aData[].  This can happen
  4878 ** if the page is overfull.  Part of the job of this routine is to
  4879 ** make sure all Cells for pPage once again fit in pPage->aData[].
  4880 **
  4881 ** In the course of balancing the siblings of pPage, the parent of pPage
  4882 ** might become overfull or underfull.  If that happens, then this routine
  4883 ** is called recursively on the parent.
  4884 **
  4885 ** If this routine fails for any reason, it might leave the database
  4886 ** in a corrupted state.  So if this routine fails, the database should
  4887 ** be rolled back.
  4888 */
  4889 static int balance_nonroot(BtCursor *pCur){
  4890   MemPage *pPage;              /* The over or underfull page to balance */
  4891   MemPage *pParent;            /* The parent of pPage */
  4892   BtShared *pBt;               /* The whole database */
  4893   int nCell = 0;               /* Number of cells in apCell[] */
  4894   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  4895   int nOld;                    /* Number of pages in apOld[] */
  4896   int nNew;                    /* Number of pages in apNew[] */
  4897   int nDiv;                    /* Number of cells in apDiv[] */
  4898   int i, j, k;                 /* Loop counters */
  4899   int idx;                     /* Index of pPage in pParent->aCell[] */
  4900   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  4901   int rc;                      /* The return code */
  4902   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  4903   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  4904   int usableSpace;             /* Bytes in pPage beyond the header */
  4905   int pageFlags;               /* Value of pPage->aData[0] */
  4906   int subtotal;                /* Subtotal of bytes in cells on one page */
  4907   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  4908   int iSpace2 = 0;             /* First unused byte of aSpace2[] */
  4909   int szScratch;               /* Size of scratch memory requested */
  4910   MemPage *apOld[NB];          /* pPage and up to two siblings */
  4911   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
  4912   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  4913   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  4914   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  4915   u8 *apDiv[NB];               /* Divider cells in pParent */
  4916   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  4917   int szNew[NB+2];             /* Combined size of cells place on i-th page */
  4918   u8 **apCell = 0;             /* All cells begin balanced */
  4919   u16 *szCell;                 /* Local size of all cells in apCell[] */
  4920   u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
  4921   u8 *aSpace1;           /* Space for copies of dividers cells before balance */
  4922   u8 *aSpace2 = 0;       /* Space for overflow dividers cells after balance */
  4923   u8 *aFrom = 0;
  4924 
  4925   pPage = pCur->apPage[pCur->iPage];
  4926   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4927   VVA_ONLY( pCur->pagesShuffled = 1 );
  4928 
  4929   /* 
  4930   ** Find the parent page.
  4931   */
  4932   assert( pCur->iPage>0 );
  4933   assert( pPage->isInit );
  4934   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
  4935   pBt = pPage->pBt;
  4936   pParent = pCur->apPage[pCur->iPage-1];
  4937   assert( pParent );
  4938   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
  4939     return rc;
  4940   }
  4941 
  4942   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
  4943 
  4944 #ifndef SQLITE_OMIT_QUICKBALANCE
  4945   /*
  4946   ** A special case:  If a new entry has just been inserted into a
  4947   ** table (that is, a btree with integer keys and all data at the leaves)
  4948   ** and the new entry is the right-most entry in the tree (it has the
  4949   ** largest key) then use the special balance_quick() routine for
  4950   ** balancing.  balance_quick() is much faster and results in a tighter
  4951   ** packing of data in the common case.
  4952   */
  4953   if( pPage->leaf &&
  4954       pPage->intKey &&
  4955       pPage->nOverflow==1 &&
  4956       pPage->aOvfl[0].idx==pPage->nCell &&
  4957       pParent->pgno!=1 &&
  4958       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
  4959   ){
  4960     assert( pPage->intKey );
  4961     /*
  4962     ** TODO: Check the siblings to the left of pPage. It may be that
  4963     ** they are not full and no new page is required.
  4964     */
  4965     return balance_quick(pCur);
  4966   }
  4967 #endif
  4968 
  4969   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
  4970     return rc;
  4971   }
  4972 
  4973   /*
  4974   ** Find the cell in the parent page whose left child points back
  4975   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  4976   ** is the rightmost child of pParent then set idx to pParent->nCell 
  4977   */
  4978   idx = pCur->aiIdx[pCur->iPage-1];
  4979   assertParentIndex(pParent, idx, pPage->pgno);
  4980 
  4981   /*
  4982   ** Initialize variables so that it will be safe to jump
  4983   ** directly to balance_cleanup at any moment.
  4984   */
  4985   nOld = nNew = 0;
  4986 
  4987   /*
  4988   ** Find sibling pages to pPage and the cells in pParent that divide
  4989   ** the siblings.  An attempt is made to find NN siblings on either
  4990   ** side of pPage.  More siblings are taken from one side, however, if
  4991   ** pPage there are fewer than NN siblings on the other side.  If pParent
  4992   ** has NB or fewer children then all children of pParent are taken.
  4993   */
  4994   nxDiv = idx - NN;
  4995   if( nxDiv + NB > pParent->nCell ){
  4996     nxDiv = pParent->nCell - NB + 1;
  4997   }
  4998   if( nxDiv<0 ){
  4999     nxDiv = 0;
  5000   }
  5001   nDiv = 0;
  5002   for(i=0, k=nxDiv; i<NB; i++, k++){
  5003     if( k<pParent->nCell ){
  5004       apDiv[i] = findCell(pParent, k);
  5005       nDiv++;
  5006       assert( !pParent->leaf );
  5007       pgnoOld[i] = get4byte(apDiv[i]);
  5008     }else if( k==pParent->nCell ){
  5009       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
  5010     }else{
  5011       break;
  5012     }
  5013     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
  5014     if( rc ) goto balance_cleanup;
  5015     /* apOld[i]->idxParent = k; */
  5016     apCopy[i] = 0;
  5017     assert( i==nOld );
  5018     nOld++;
  5019     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  5020   }
  5021 
  5022   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  5023   ** alignment */
  5024   nMaxCells = (nMaxCells + 3)&~3;
  5025 
  5026   /*
  5027   ** Allocate space for memory structures
  5028   */
  5029   szScratch =
  5030        nMaxCells*sizeof(u8*)                       /* apCell */
  5031      + nMaxCells*sizeof(u16)                       /* szCell */
  5032      + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
  5033      + pBt->pageSize                               /* aSpace1 */
  5034      + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
  5035   apCell = sqlite3ScratchMalloc( szScratch ); 
  5036   if( apCell==0 ){
  5037     rc = SQLITE_NOMEM;
  5038     goto balance_cleanup;
  5039   }
  5040   szCell = (u16*)&apCell[nMaxCells];
  5041   aCopy[0] = (u8*)&szCell[nMaxCells];
  5042   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5043   for(i=1; i<NB; i++){
  5044     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  5045     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5046   }
  5047   aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  5048   assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5049   if( ISAUTOVACUUM ){
  5050     aFrom = &aSpace1[pBt->pageSize];
  5051   }
  5052   aSpace2 = sqlite3PageMalloc(pBt->pageSize);
  5053   if( aSpace2==0 ){
  5054     rc = SQLITE_NOMEM;
  5055     goto balance_cleanup;
  5056   }
  5057   
  5058   /*
  5059   ** Make copies of the content of pPage and its siblings into aOld[].
  5060   ** The rest of this function will use data from the copies rather
  5061   ** that the original pages since the original pages will be in the
  5062   ** process of being overwritten.
  5063   */
  5064   for(i=0; i<nOld; i++){
  5065     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
  5066     memcpy(p, apOld[i], sizeof(MemPage));
  5067     p->aData = (void*)&p[1];
  5068     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
  5069   }
  5070 
  5071   /*
  5072   ** Load pointers to all cells on sibling pages and the divider cells
  5073   ** into the local apCell[] array.  Make copies of the divider cells
  5074   ** into space obtained form aSpace1[] and remove the the divider Cells
  5075   ** from pParent.
  5076   **
  5077   ** If the siblings are on leaf pages, then the child pointers of the
  5078   ** divider cells are stripped from the cells before they are copied
  5079   ** into aSpace1[].  In this way, all cells in apCell[] are without
  5080   ** child pointers.  If siblings are not leaves, then all cell in
  5081   ** apCell[] include child pointers.  Either way, all cells in apCell[]
  5082   ** are alike.
  5083   **
  5084   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  5085   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  5086   */
  5087   nCell = 0;
  5088   leafCorrection = pPage->leaf*4;
  5089   leafData = pPage->hasData;
  5090   for(i=0; i<nOld; i++){
  5091     MemPage *pOld = apCopy[i];
  5092     int limit = pOld->nCell+pOld->nOverflow;
  5093     for(j=0; j<limit; j++){
  5094       assert( nCell<nMaxCells );
  5095       apCell[nCell] = findOverflowCell(pOld, j);
  5096       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
  5097       if( ISAUTOVACUUM ){
  5098         int a;
  5099         aFrom[nCell] = i;
  5100         for(a=0; a<pOld->nOverflow; a++){
  5101           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
  5102             aFrom[nCell] = 0xFF;
  5103             break;
  5104           }
  5105         }
  5106       }
  5107       nCell++;
  5108     }
  5109     if( i<nOld-1 ){
  5110       u16 sz = cellSizePtr(pParent, apDiv[i]);
  5111       if( leafData ){
  5112         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
  5113         ** are duplicates of keys on the child pages.  We need to remove
  5114         ** the divider cells from pParent, but the dividers cells are not
  5115         ** added to apCell[] because they are duplicates of child cells.
  5116         */
  5117         dropCell(pParent, nxDiv, sz);
  5118       }else{
  5119         u8 *pTemp;
  5120         assert( nCell<nMaxCells );
  5121         szCell[nCell] = sz;
  5122         pTemp = &aSpace1[iSpace1];
  5123         iSpace1 += sz;
  5124         assert( sz<=pBt->pageSize/4 );
  5125         assert( iSpace1<=pBt->pageSize );
  5126         memcpy(pTemp, apDiv[i], sz);
  5127         apCell[nCell] = pTemp+leafCorrection;
  5128         if( ISAUTOVACUUM ){
  5129           aFrom[nCell] = 0xFF;
  5130         }
  5131         dropCell(pParent, nxDiv, sz);
  5132         szCell[nCell] -= leafCorrection;
  5133         assert( get4byte(pTemp)==pgnoOld[i] );
  5134         if( !pOld->leaf ){
  5135           assert( leafCorrection==0 );
  5136           /* The right pointer of the child page pOld becomes the left
  5137           ** pointer of the divider cell */
  5138           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
  5139         }else{
  5140           assert( leafCorrection==4 );
  5141           if( szCell[nCell]<4 ){
  5142             /* Do not allow any cells smaller than 4 bytes. */
  5143             szCell[nCell] = 4;
  5144           }
  5145         }
  5146         nCell++;
  5147       }
  5148     }
  5149   }
  5150 
  5151   /*
  5152   ** Figure out the number of pages needed to hold all nCell cells.
  5153   ** Store this number in "k".  Also compute szNew[] which is the total
  5154   ** size of all cells on the i-th page and cntNew[] which is the index
  5155   ** in apCell[] of the cell that divides page i from page i+1.  
  5156   ** cntNew[k] should equal nCell.
  5157   **
  5158   ** Values computed by this block:
  5159   **
  5160   **           k: The total number of sibling pages
  5161   **    szNew[i]: Spaced used on the i-th sibling page.
  5162   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  5163   **              the right of the i-th sibling page.
  5164   ** usableSpace: Number of bytes of space available on each sibling.
  5165   ** 
  5166   */
  5167   usableSpace = pBt->usableSize - 12 + leafCorrection;
  5168   for(subtotal=k=i=0; i<nCell; i++){
  5169     assert( i<nMaxCells );
  5170     subtotal += szCell[i] + 2;
  5171     if( subtotal > usableSpace ){
  5172       szNew[k] = subtotal - szCell[i];
  5173       cntNew[k] = i;
  5174       if( leafData ){ i--; }
  5175       subtotal = 0;
  5176       k++;
  5177     }
  5178   }
  5179   szNew[k] = subtotal;
  5180   cntNew[k] = nCell;
  5181   k++;
  5182 
  5183   /*
  5184   ** The packing computed by the previous block is biased toward the siblings
  5185   ** on the left side.  The left siblings are always nearly full, while the
  5186   ** right-most sibling might be nearly empty.  This block of code attempts
  5187   ** to adjust the packing of siblings to get a better balance.
  5188   **
  5189   ** This adjustment is more than an optimization.  The packing above might
  5190   ** be so out of balance as to be illegal.  For example, the right-most
  5191   ** sibling might be completely empty.  This adjustment is not optional.
  5192   */
  5193   for(i=k-1; i>0; i--){
  5194     int szRight = szNew[i];  /* Size of sibling on the right */
  5195     int szLeft = szNew[i-1]; /* Size of sibling on the left */
  5196     int r;              /* Index of right-most cell in left sibling */
  5197     int d;              /* Index of first cell to the left of right sibling */
  5198 
  5199     r = cntNew[i-1] - 1;
  5200     d = r + 1 - leafData;
  5201     assert( d<nMaxCells );
  5202     assert( r<nMaxCells );
  5203     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
  5204       szRight += szCell[d] + 2;
  5205       szLeft -= szCell[r] + 2;
  5206       cntNew[i-1]--;
  5207       r = cntNew[i-1] - 1;
  5208       d = r + 1 - leafData;
  5209     }
  5210     szNew[i] = szRight;
  5211     szNew[i-1] = szLeft;
  5212   }
  5213 
  5214   /* Either we found one or more cells (cntnew[0])>0) or we are the
  5215   ** a virtual root page.  A virtual root page is when the real root
  5216   ** page is page 1 and we are the only child of that page.
  5217   */
  5218   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
  5219 
  5220   /*
  5221   ** Allocate k new pages.  Reuse old pages where possible.
  5222   */
  5223   assert( pPage->pgno>1 );
  5224   pageFlags = pPage->aData[0];
  5225   for(i=0; i<k; i++){
  5226     MemPage *pNew;
  5227     if( i<nOld ){
  5228       pNew = apNew[i] = apOld[i];
  5229       pgnoNew[i] = pgnoOld[i];
  5230       apOld[i] = 0;
  5231       rc = sqlite3PagerWrite(pNew->pDbPage);
  5232       nNew++;
  5233       if( rc ) goto balance_cleanup;
  5234     }else{
  5235       assert( i>0 );
  5236       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
  5237       if( rc ) goto balance_cleanup;
  5238       apNew[i] = pNew;
  5239       nNew++;
  5240     }
  5241   }
  5242 
  5243   /* Free any old pages that were not reused as new pages.
  5244   */
  5245   while( i<nOld ){
  5246     rc = freePage(apOld[i]);
  5247     if( rc ) goto balance_cleanup;
  5248     releasePage(apOld[i]);
  5249     apOld[i] = 0;
  5250     i++;
  5251   }
  5252 
  5253   /*
  5254   ** Put the new pages in accending order.  This helps to
  5255   ** keep entries in the disk file in order so that a scan
  5256   ** of the table is a linear scan through the file.  That
  5257   ** in turn helps the operating system to deliver pages
  5258   ** from the disk more rapidly.
  5259   **
  5260   ** An O(n^2) insertion sort algorithm is used, but since
  5261   ** n is never more than NB (a small constant), that should
  5262   ** not be a problem.
  5263   **
  5264   ** When NB==3, this one optimization makes the database
  5265   ** about 25% faster for large insertions and deletions.
  5266   */
  5267   for(i=0; i<k-1; i++){
  5268     int minV = pgnoNew[i];
  5269     int minI = i;
  5270     for(j=i+1; j<k; j++){
  5271       if( pgnoNew[j]<(unsigned)minV ){
  5272         minI = j;
  5273         minV = pgnoNew[j];
  5274       }
  5275     }
  5276     if( minI>i ){
  5277       int t;
  5278       MemPage *pT;
  5279       t = pgnoNew[i];
  5280       pT = apNew[i];
  5281       pgnoNew[i] = pgnoNew[minI];
  5282       apNew[i] = apNew[minI];
  5283       pgnoNew[minI] = t;
  5284       apNew[minI] = pT;
  5285     }
  5286   }
  5287   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
  5288     pgnoOld[0], 
  5289     nOld>=2 ? pgnoOld[1] : 0,
  5290     nOld>=3 ? pgnoOld[2] : 0,
  5291     pgnoNew[0], szNew[0],
  5292     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
  5293     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
  5294     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
  5295     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
  5296 
  5297   /*
  5298   ** Evenly distribute the data in apCell[] across the new pages.
  5299   ** Insert divider cells into pParent as necessary.
  5300   */
  5301   j = 0;
  5302   for(i=0; i<nNew; i++){
  5303     /* Assemble the new sibling page. */
  5304     MemPage *pNew = apNew[i];
  5305     assert( j<nMaxCells );
  5306     assert( pNew->pgno==pgnoNew[i] );
  5307     zeroPage(pNew, pageFlags);
  5308     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
  5309     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
  5310     assert( pNew->nOverflow==0 );
  5311 
  5312     /* If this is an auto-vacuum database, update the pointer map entries
  5313     ** that point to the siblings that were rearranged. These can be: left
  5314     ** children of cells, the right-child of the page, or overflow pages
  5315     ** pointed to by cells.
  5316     */
  5317     if( ISAUTOVACUUM ){
  5318       for(k=j; k<cntNew[i]; k++){
  5319         assert( k<nMaxCells );
  5320         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
  5321           rc = ptrmapPutOvfl(pNew, k-j);
  5322           if( rc==SQLITE_OK && leafCorrection==0 ){
  5323             rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
  5324           }
  5325           if( rc!=SQLITE_OK ){
  5326             goto balance_cleanup;
  5327           }
  5328         }
  5329       }
  5330     }
  5331 
  5332     j = cntNew[i];
  5333 
  5334     /* If the sibling page assembled above was not the right-most sibling,
  5335     ** insert a divider cell into the parent page.
  5336     */
  5337     if( i<nNew-1 && j<nCell ){
  5338       u8 *pCell;
  5339       u8 *pTemp;
  5340       int sz;
  5341 
  5342       assert( j<nMaxCells );
  5343       pCell = apCell[j];
  5344       sz = szCell[j] + leafCorrection;
  5345       pTemp = &aSpace2[iSpace2];
  5346       if( !pNew->leaf ){
  5347         memcpy(&pNew->aData[8], pCell, 4);
  5348         if( ISAUTOVACUUM 
  5349          && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
  5350         ){
  5351           rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
  5352           if( rc!=SQLITE_OK ){
  5353             goto balance_cleanup;
  5354           }
  5355         }
  5356       }else if( leafData ){
  5357         /* If the tree is a leaf-data tree, and the siblings are leaves, 
  5358         ** then there is no divider cell in apCell[]. Instead, the divider 
  5359         ** cell consists of the integer key for the right-most cell of 
  5360         ** the sibling-page assembled above only.
  5361         */
  5362         CellInfo info;
  5363         j--;
  5364         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
  5365         pCell = pTemp;
  5366         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
  5367         pTemp = 0;
  5368       }else{
  5369         pCell -= 4;
  5370         /* Obscure case for non-leaf-data trees: If the cell at pCell was
  5371         ** previously stored on a leaf node, and its reported size was 4
  5372         ** bytes, then it may actually be smaller than this 
  5373         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
  5374         ** any cell). But it is important to pass the correct size to 
  5375         ** insertCell(), so reparse the cell now.
  5376         **
  5377         ** Note that this can never happen in an SQLite data file, as all
  5378         ** cells are at least 4 bytes. It only happens in b-trees used
  5379         ** to evaluate "IN (SELECT ...)" and similar clauses.
  5380         */
  5381         if( szCell[j]==4 ){
  5382           assert(leafCorrection==4);
  5383           sz = cellSizePtr(pParent, pCell);
  5384         }
  5385       }
  5386       iSpace2 += sz;
  5387       assert( sz<=pBt->pageSize/4 );
  5388       assert( iSpace2<=pBt->pageSize );
  5389       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
  5390       if( rc!=SQLITE_OK ) goto balance_cleanup;
  5391       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
  5392 
  5393       /* If this is an auto-vacuum database, and not a leaf-data tree,
  5394       ** then update the pointer map with an entry for the overflow page
  5395       ** that the cell just inserted points to (if any).
  5396       */
  5397       if( ISAUTOVACUUM && !leafData ){
  5398         rc = ptrmapPutOvfl(pParent, nxDiv);
  5399         if( rc!=SQLITE_OK ){
  5400           goto balance_cleanup;
  5401         }
  5402       }
  5403       j++;
  5404       nxDiv++;
  5405     }
  5406 
  5407     /* Set the pointer-map entry for the new sibling page. */
  5408     if( ISAUTOVACUUM ){
  5409       rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
  5410       if( rc!=SQLITE_OK ){
  5411         goto balance_cleanup;
  5412       }
  5413     }
  5414   }
  5415   assert( j==nCell );
  5416   assert( nOld>0 );
  5417   assert( nNew>0 );
  5418   if( (pageFlags & PTF_LEAF)==0 ){
  5419     u8 *zChild = &apCopy[nOld-1]->aData[8];
  5420     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
  5421     if( ISAUTOVACUUM ){
  5422       rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
  5423       if( rc!=SQLITE_OK ){
  5424         goto balance_cleanup;
  5425       }
  5426     }
  5427   }
  5428   if( nxDiv==pParent->nCell+pParent->nOverflow ){
  5429     /* Right-most sibling is the right-most child of pParent */
  5430     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
  5431   }else{
  5432     /* Right-most sibling is the left child of the first entry in pParent
  5433     ** past the right-most divider entry */
  5434     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
  5435   }
  5436 
  5437   /*
  5438   ** Balance the parent page.  Note that the current page (pPage) might
  5439   ** have been added to the freelist so it might no longer be initialized.
  5440   ** But the parent page will always be initialized.
  5441   */
  5442   assert( pParent->isInit );
  5443   sqlite3ScratchFree(apCell);
  5444   apCell = 0;
  5445   releasePage(pPage);
  5446   pCur->iPage--;
  5447   rc = balance(pCur, 0);
  5448   
  5449   /*
  5450   ** Cleanup before returning.
  5451   */
  5452 balance_cleanup:
  5453   sqlite3PageFree(aSpace2);
  5454   sqlite3ScratchFree(apCell);
  5455   for(i=0; i<nOld; i++){
  5456     releasePage(apOld[i]);
  5457   }
  5458   for(i=0; i<nNew; i++){
  5459     releasePage(apNew[i]);
  5460   }
  5461 
  5462   /* releasePage(pParent); */
  5463   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
  5464           pPage->pgno, nOld, nNew, nCell));
  5465 
  5466   return rc;
  5467 }
  5468 
  5469 /*
  5470 ** This routine is called for the root page of a btree when the root
  5471 ** page contains no cells.  This is an opportunity to make the tree
  5472 ** shallower by one level.
  5473 */
  5474 static int balance_shallower(BtCursor *pCur){
  5475   MemPage *pPage;              /* Root page of B-Tree */
  5476   MemPage *pChild;             /* The only child page of pPage */
  5477   Pgno pgnoChild;              /* Page number for pChild */
  5478   int rc = SQLITE_OK;          /* Return code from subprocedures */
  5479   BtShared *pBt;                  /* The main BTree structure */
  5480   int mxCellPerPage;           /* Maximum number of cells per page */
  5481   u8 **apCell;                 /* All cells from pages being balanced */
  5482   u16 *szCell;                 /* Local size of all cells */
  5483 
  5484   assert( pCur->iPage==0 );
  5485   pPage = pCur->apPage[0];
  5486 
  5487   assert( pPage->nCell==0 );
  5488   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5489   pBt = pPage->pBt;
  5490   mxCellPerPage = MX_CELL(pBt);
  5491   apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
  5492   if( apCell==0 ) return SQLITE_NOMEM;
  5493   szCell = (u16*)&apCell[mxCellPerPage];
  5494   if( pPage->leaf ){
  5495     /* The table is completely empty */
  5496     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
  5497   }else{
  5498     /* The root page is empty but has one child.  Transfer the
  5499     ** information from that one child into the root page if it 
  5500     ** will fit.  This reduces the depth of the tree by one.
  5501     **
  5502     ** If the root page is page 1, it has less space available than
  5503     ** its child (due to the 100 byte header that occurs at the beginning
  5504     ** of the database fle), so it might not be able to hold all of the 
  5505     ** information currently contained in the child.  If this is the 
  5506     ** case, then do not do the transfer.  Leave page 1 empty except
  5507     ** for the right-pointer to the child page.  The child page becomes
  5508     ** the virtual root of the tree.
  5509     */
  5510     VVA_ONLY( pCur->pagesShuffled = 1 );
  5511     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  5512     assert( pgnoChild>0 );
  5513     assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
  5514     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
  5515     if( rc ) goto end_shallow_balance;
  5516     if( pPage->pgno==1 ){
  5517       rc = sqlite3BtreeInitPage(pChild);
  5518       if( rc ) goto end_shallow_balance;
  5519       assert( pChild->nOverflow==0 );
  5520       if( pChild->nFree>=100 ){
  5521         /* The child information will fit on the root page, so do the
  5522         ** copy */
  5523         int i;
  5524         zeroPage(pPage, pChild->aData[0]);
  5525         for(i=0; i<pChild->nCell; i++){
  5526           apCell[i] = findCell(pChild,i);
  5527           szCell[i] = cellSizePtr(pChild, apCell[i]);
  5528         }
  5529         assemblePage(pPage, pChild->nCell, apCell, szCell);
  5530         /* Copy the right-pointer of the child to the parent. */
  5531         put4byte(&pPage->aData[pPage->hdrOffset+8], 
  5532             get4byte(&pChild->aData[pChild->hdrOffset+8]));
  5533         freePage(pChild);
  5534         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
  5535       }else{
  5536         /* The child has more information that will fit on the root.
  5537         ** The tree is already balanced.  Do nothing. */
  5538         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
  5539       }
  5540     }else{
  5541       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
  5542       pPage->isInit = 0;
  5543       rc = sqlite3BtreeInitPage(pPage);
  5544       assert( rc==SQLITE_OK );
  5545       freePage(pChild);
  5546       TRACE(("BALANCE: transfer child %d into root %d\n",
  5547               pChild->pgno, pPage->pgno));
  5548     }
  5549     assert( pPage->nOverflow==0 );
  5550     if( ISAUTOVACUUM ){
  5551       rc = setChildPtrmaps(pPage);
  5552     }
  5553     releasePage(pChild);
  5554   }
  5555 end_shallow_balance:
  5556   sqlite3_free(apCell);
  5557   return rc;
  5558 }
  5559 
  5560 
  5561 /*
  5562 ** The root page is overfull
  5563 **
  5564 ** When this happens, Create a new child page and copy the
  5565 ** contents of the root into the child.  Then make the root
  5566 ** page an empty page with rightChild pointing to the new
  5567 ** child.   Finally, call balance_internal() on the new child
  5568 ** to cause it to split.
  5569 */
  5570 static int balance_deeper(BtCursor *pCur){
  5571   int rc;             /* Return value from subprocedures */
  5572   MemPage *pPage;     /* Pointer to the root page */
  5573   MemPage *pChild;    /* Pointer to a new child page */
  5574   Pgno pgnoChild;     /* Page number of the new child page */
  5575   BtShared *pBt;         /* The BTree */
  5576   int usableSize;     /* Total usable size of a page */
  5577   u8 *data;           /* Content of the parent page */
  5578   u8 *cdata;          /* Content of the child page */
  5579   int hdr;            /* Offset to page header in parent */
  5580   int cbrk;           /* Offset to content of first cell in parent */
  5581 
  5582   assert( pCur->iPage==0 );
  5583   assert( pCur->apPage[0]->nOverflow>0 );
  5584 
  5585   VVA_ONLY( pCur->pagesShuffled = 1 );
  5586   pPage = pCur->apPage[0];
  5587   pBt = pPage->pBt;
  5588   assert( sqlite3_mutex_held(pBt->mutex) );
  5589   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
  5590   if( rc ) return rc;
  5591   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  5592   usableSize = pBt->usableSize;
  5593   data = pPage->aData;
  5594   hdr = pPage->hdrOffset;
  5595   cbrk = get2byte(&data[hdr+5]);
  5596   cdata = pChild->aData;
  5597   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
  5598   memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
  5599   
  5600   rc = sqlite3BtreeInitPage(pChild);
  5601   if( rc==SQLITE_OK ){
  5602     int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
  5603     memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
  5604     pChild->nOverflow = pPage->nOverflow;
  5605     if( pChild->nOverflow ){
  5606       pChild->nFree = 0;
  5607     }
  5608     assert( pChild->nCell==pPage->nCell );
  5609     zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
  5610     put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
  5611     TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
  5612     if( ISAUTOVACUUM ){
  5613       rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
  5614       if( rc==SQLITE_OK ){
  5615         rc = setChildPtrmaps(pChild);
  5616       }
  5617     }
  5618   }
  5619 
  5620   if( rc==SQLITE_OK ){
  5621     pCur->iPage++;
  5622     pCur->apPage[1] = pChild;
  5623     pCur->aiIdx[0] = 0;
  5624     rc = balance_nonroot(pCur);
  5625   }else{
  5626     releasePage(pChild);
  5627   }
  5628 
  5629   return rc;
  5630 }
  5631 
  5632 /*
  5633 ** The page that pCur currently points to has just been modified in
  5634 ** some way. This function figures out if this modification means the
  5635 ** tree needs to be balanced, and if so calls the appropriate balancing 
  5636 ** routine.
  5637 ** 
  5638 ** Parameter isInsert is true if a new cell was just inserted into the
  5639 ** page, or false otherwise.
  5640 */
  5641 static int balance(BtCursor *pCur, int isInsert){
  5642   int rc = SQLITE_OK;
  5643   MemPage *pPage = pCur->apPage[pCur->iPage];
  5644 
  5645   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5646   if( pCur->iPage==0 ){
  5647     rc = sqlite3PagerWrite(pPage->pDbPage);
  5648     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
  5649       rc = balance_deeper(pCur);
  5650     }
  5651     if( rc==SQLITE_OK && pPage->nCell==0 ){
  5652       rc = balance_shallower(pCur);
  5653     }
  5654   }else{
  5655     if( pPage->nOverflow>0 || 
  5656         (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
  5657       rc = balance_nonroot(pCur);
  5658     }
  5659   }
  5660   return rc;
  5661 }
  5662 
  5663 /*
  5664 ** This routine checks all cursors that point to table pgnoRoot.
  5665 ** If any of those cursors were opened with wrFlag==0 in a different
  5666 ** database connection (a database connection that shares the pager
  5667 ** cache with the current connection) and that other connection 
  5668 ** is not in the ReadUncommmitted state, then this routine returns 
  5669 ** SQLITE_LOCKED.
  5670 **
  5671 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and 
  5672 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental 
  5673 ** blob cursors are used for both reading and writing.
  5674 **
  5675 ** When pgnoRoot is the root page of an intkey table, this function is also
  5676 ** responsible for invalidating incremental blob cursors when the table row
  5677 ** on which they are opened is deleted or modified. Cursors are invalidated
  5678 ** according to the following rules:
  5679 **
  5680 **   1) When BtreeClearTable() is called to completely delete the contents
  5681 **      of a B-Tree table, pExclude is set to zero and parameter iRow is 
  5682 **      set to non-zero. In this case all incremental blob cursors open
  5683 **      on the table rooted at pgnoRoot are invalidated.
  5684 **
  5685 **   2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 
  5686 **      modify a table row via an SQL statement, pExclude is set to the 
  5687 **      write cursor used to do the modification and parameter iRow is set
  5688 **      to the integer row id of the B-Tree entry being modified. Unless
  5689 **      pExclude is itself an incremental blob cursor, then all incremental
  5690 **      blob cursors open on row iRow of the B-Tree are invalidated.
  5691 **
  5692 **   3) If both pExclude and iRow are set to zero, no incremental blob 
  5693 **      cursors are invalidated.
  5694 */
  5695 static int checkReadLocks(
  5696   Btree *pBtree, 
  5697   Pgno pgnoRoot, 
  5698   BtCursor *pExclude,
  5699   i64 iRow
  5700 ){
  5701   BtCursor *p;
  5702   BtShared *pBt = pBtree->pBt;
  5703   sqlite3 *db = pBtree->db;
  5704   assert( sqlite3BtreeHoldsMutex(pBtree) );
  5705   for(p=pBt->pCursor; p; p=p->pNext){
  5706     if( p==pExclude ) continue;
  5707     if( p->pgnoRoot!=pgnoRoot ) continue;
  5708 #ifndef SQLITE_OMIT_INCRBLOB
  5709     if( p->isIncrblobHandle && ( 
  5710          (!pExclude && iRow)
  5711       || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
  5712     )){
  5713       p->eState = CURSOR_INVALID;
  5714     }
  5715 #endif
  5716     if( p->eState!=CURSOR_VALID ) continue;
  5717     if( p->wrFlag==0 
  5718 #ifndef SQLITE_OMIT_INCRBLOB
  5719      || p->isIncrblobHandle
  5720 #endif
  5721     ){
  5722       sqlite3 *dbOther = p->pBtree->db;
  5723       if( dbOther==0 ||
  5724          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
  5725         return SQLITE_LOCKED;
  5726       }
  5727     }
  5728   }
  5729   return SQLITE_OK;
  5730 }
  5731 
  5732 /*
  5733 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
  5734 ** and the data is given by (pData,nData).  The cursor is used only to
  5735 ** define what table the record should be inserted into.  The cursor
  5736 ** is left pointing at a random location.
  5737 **
  5738 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
  5739 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
  5740 */
  5741 int sqlite3BtreeInsert(
  5742   BtCursor *pCur,                /* Insert data into the table of this cursor */
  5743   const void *pKey, i64 nKey,    /* The key of the new record */
  5744   const void *pData, int nData,  /* The data of the new record */
  5745   int nZero,                     /* Number of extra 0 bytes to append to data */
  5746   int appendBias                 /* True if this is likely an append */
  5747 ){
  5748   int rc;
  5749   int loc;
  5750   int szNew;
  5751   int idx;
  5752   MemPage *pPage;
  5753   Btree *p = pCur->pBtree;
  5754   BtShared *pBt = p->pBt;
  5755   unsigned char *oldCell;
  5756   unsigned char *newCell = 0;
  5757 
  5758   assert( cursorHoldsMutex(pCur) );
  5759   if( pBt->inTransaction!=TRANS_WRITE ){
  5760     /* Must start a transaction before doing an insert */
  5761     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  5762     return rc;
  5763   }
  5764   assert( !pBt->readOnly );
  5765   if( !pCur->wrFlag ){
  5766     return SQLITE_PERM;   /* Cursor not open for writing */
  5767   }
  5768   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
  5769     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  5770   }
  5771   if( pCur->eState==CURSOR_FAULT ){
  5772     return pCur->skip;
  5773   }
  5774 
  5775   /* Save the positions of any other cursors open on this table */
  5776   clearCursorPosition(pCur);
  5777   if( 
  5778     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
  5779     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
  5780   ){
  5781     return rc;
  5782   }
  5783 
  5784   pPage = pCur->apPage[pCur->iPage];
  5785   assert( pPage->intKey || nKey>=0 );
  5786   assert( pPage->leaf || !pPage->intKey );
  5787   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
  5788           pCur->pgnoRoot, nKey, nData, pPage->pgno,
  5789           loc==0 ? "overwrite" : "new entry"));
  5790   assert( pPage->isInit );
  5791   allocateTempSpace(pBt);
  5792   newCell = pBt->pTmpSpace;
  5793   if( newCell==0 ) return SQLITE_NOMEM;
  5794   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  5795   if( rc ) goto end_insert;
  5796   assert( szNew==cellSizePtr(pPage, newCell) );
  5797   assert( szNew<=MX_CELL_SIZE(pBt) );
  5798   idx = pCur->aiIdx[pCur->iPage];
  5799   if( loc==0 && CURSOR_VALID==pCur->eState ){
  5800     u16 szOld;
  5801     assert( idx<pPage->nCell );
  5802     rc = sqlite3PagerWrite(pPage->pDbPage);
  5803     if( rc ){
  5804       goto end_insert;
  5805     }
  5806     oldCell = findCell(pPage, idx);
  5807     if( !pPage->leaf ){
  5808       memcpy(newCell, oldCell, 4);
  5809     }
  5810     szOld = cellSizePtr(pPage, oldCell);
  5811     rc = clearCell(pPage, oldCell);
  5812     if( rc ) goto end_insert;
  5813     dropCell(pPage, idx, szOld);
  5814   }else if( loc<0 && pPage->nCell>0 ){
  5815     assert( pPage->leaf );
  5816     idx = ++pCur->aiIdx[pCur->iPage];
  5817     pCur->info.nSize = 0;
  5818     pCur->validNKey = 0;
  5819   }else{
  5820     assert( pPage->leaf );
  5821   }
  5822   rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
  5823   if( rc!=SQLITE_OK ) goto end_insert;
  5824   rc = balance(pCur, 1);
  5825   if( rc==SQLITE_OK ){
  5826     moveToRoot(pCur);
  5827   }
  5828 end_insert:
  5829   return rc;
  5830 }
  5831 
  5832 /*
  5833 ** Delete the entry that the cursor is pointing to.  The cursor
  5834 ** is left pointing at a arbitrary location.
  5835 */
  5836 int sqlite3BtreeDelete(BtCursor *pCur){
  5837   MemPage *pPage = pCur->apPage[pCur->iPage];
  5838   int idx;
  5839   unsigned char *pCell;
  5840   int rc;
  5841   Pgno pgnoChild = 0;
  5842   Btree *p = pCur->pBtree;
  5843   BtShared *pBt = p->pBt;
  5844 
  5845   assert( cursorHoldsMutex(pCur) );
  5846   assert( pPage->isInit );
  5847   if( pBt->inTransaction!=TRANS_WRITE ){
  5848     /* Must start a transaction before doing a delete */
  5849     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  5850     return rc;
  5851   }
  5852   assert( !pBt->readOnly );
  5853   if( pCur->eState==CURSOR_FAULT ){
  5854     return pCur->skip;
  5855   }
  5856   if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
  5857     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
  5858   }
  5859   if( !pCur->wrFlag ){
  5860     return SQLITE_PERM;   /* Did not open this cursor for writing */
  5861   }
  5862   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
  5863     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  5864   }
  5865 
  5866   /* Restore the current cursor position (a no-op if the cursor is not in 
  5867   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 
  5868   ** open on the same table. Then call sqlite3PagerWrite() on the page
  5869   ** that the entry will be deleted from.
  5870   */
  5871   if( 
  5872     (rc = restoreCursorPosition(pCur))!=0 ||
  5873     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
  5874     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
  5875   ){
  5876     return rc;
  5877   }
  5878 
  5879   /* Locate the cell within its page and leave pCell pointing to the
  5880   ** data. The clearCell() call frees any overflow pages associated with the
  5881   ** cell. The cell itself is still intact.
  5882   */
  5883   idx = pCur->aiIdx[pCur->iPage];
  5884   pCell = findCell(pPage, idx);
  5885   if( !pPage->leaf ){
  5886     pgnoChild = get4byte(pCell);
  5887   }
  5888   rc = clearCell(pPage, pCell);
  5889   if( rc ){
  5890     return rc;
  5891   }
  5892 
  5893   if( !pPage->leaf ){
  5894     /*
  5895     ** The entry we are about to delete is not a leaf so if we do not
  5896     ** do something we will leave a hole on an internal page.
  5897     ** We have to fill the hole by moving in a cell from a leaf.  The
  5898     ** next Cell after the one to be deleted is guaranteed to exist and
  5899     ** to be a leaf so we can use it.
  5900     */
  5901     BtCursor leafCur;
  5902     MemPage *pLeafPage;
  5903 
  5904     unsigned char *pNext;
  5905     int notUsed;
  5906     unsigned char *tempCell = 0;
  5907     assert( !pPage->intKey );
  5908     sqlite3BtreeGetTempCursor(pCur, &leafCur);
  5909     rc = sqlite3BtreeNext(&leafCur, &notUsed);
  5910     if( rc==SQLITE_OK ){
  5911       assert( leafCur.aiIdx[leafCur.iPage]==0 );
  5912       pLeafPage = leafCur.apPage[leafCur.iPage];
  5913       rc = sqlite3PagerWrite(pLeafPage->pDbPage);
  5914     }
  5915     if( rc==SQLITE_OK ){
  5916       int leafCursorInvalid = 0;
  5917       u16 szNext;
  5918       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
  5919          pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
  5920       dropCell(pPage, idx, cellSizePtr(pPage, pCell));
  5921       pNext = findCell(pLeafPage, 0);
  5922       szNext = cellSizePtr(pLeafPage, pNext);
  5923       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
  5924       allocateTempSpace(pBt);
  5925       tempCell = pBt->pTmpSpace;
  5926       if( tempCell==0 ){
  5927         rc = SQLITE_NOMEM;
  5928       }
  5929       if( rc==SQLITE_OK ){
  5930         rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
  5931       }
  5932 
  5933 
  5934       /* The "if" statement in the next code block is critical.  The
  5935       ** slightest error in that statement would allow SQLite to operate
  5936       ** correctly most of the time but produce very rare failures.  To
  5937       ** guard against this, the following macros help to verify that
  5938       ** the "if" statement is well tested.
  5939       */
  5940       testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3 
  5941                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5942       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3 
  5943                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5944       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1 
  5945                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5946       testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
  5947                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5948       testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
  5949                  && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
  5950 
  5951 
  5952       if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
  5953           (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
  5954       ){
  5955         /* This branch is taken if the internal node is now either overflowing
  5956         ** or underfull and the leaf node will be underfull after the just cell 
  5957         ** copied to the internal node is deleted from it. This is a special
  5958         ** case because the call to balance() to correct the internal node
  5959         ** may change the tree structure and invalidate the contents of
  5960         ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
  5961         ** used by the balance() required to correct the underfull leaf
  5962         ** node.
  5963         **
  5964         ** The formula used in the expression above are based on facets of
  5965         ** the SQLite file-format that do not change over time.
  5966         */
  5967         testcase( pPage->nFree==pBt->usableSize*2/3+1 );
  5968         testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
  5969         leafCursorInvalid = 1;
  5970       }        
  5971 
  5972       if( rc==SQLITE_OK ){
  5973         put4byte(findOverflowCell(pPage, idx), pgnoChild);
  5974         VVA_ONLY( pCur->pagesShuffled = 0 );
  5975         rc = balance(pCur, 0);
  5976       }
  5977 
  5978       if( rc==SQLITE_OK && leafCursorInvalid ){
  5979         /* The leaf-node is now underfull and so the tree needs to be 
  5980         ** rebalanced. However, the balance() operation on the internal
  5981         ** node above may have modified the structure of the B-Tree and
  5982         ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
  5983         ** may not be trusted.
  5984         **
  5985         ** It is not possible to copy the ancestry from pCur, as the same
  5986         ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
  5987         ** arrays. 
  5988         **
  5989         ** The call to saveCursorPosition() below internally saves the 
  5990         ** key that leafCur is currently pointing to. Currently, there
  5991         ** are two copies of that key in the tree - one here on the leaf
  5992         ** page and one on some internal node in the tree. The copy on
  5993         ** the leaf node is always the next key in tree-order after the 
  5994         ** copy on the internal node. So, the call to sqlite3BtreeNext()
  5995         ** calls restoreCursorPosition() to point the cursor to the copy
  5996         ** stored on the internal node, then advances to the next entry,
  5997         ** which happens to be the copy of the key on the internal node.
  5998         ** Net effect: leafCur is pointing back to the duplicate cell
  5999         ** that needs to be removed, and the leafCur.apPage[] and
  6000         ** leafCur.aiIdx[] arrays are correct.
  6001         */
  6002         VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
  6003         rc = saveCursorPosition(&leafCur);
  6004         if( rc==SQLITE_OK ){
  6005           rc = sqlite3BtreeNext(&leafCur, &notUsed);
  6006         }
  6007         pLeafPage = leafCur.apPage[leafCur.iPage];
  6008         assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno );
  6009         assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 );
  6010       }
  6011 
  6012       if( rc==SQLITE_OK ){
  6013         dropCell(pLeafPage, 0, szNext);
  6014         VVA_ONLY( leafCur.pagesShuffled = 0 );
  6015         rc = balance(&leafCur, 0);
  6016         assert( leafCursorInvalid || !leafCur.pagesShuffled
  6017                                    || !pCur->pagesShuffled );
  6018       }
  6019     }
  6020     sqlite3BtreeReleaseTempCursor(&leafCur);
  6021   }else{
  6022     TRACE(("DELETE: table=%d delete from leaf %d\n",
  6023        pCur->pgnoRoot, pPage->pgno));
  6024     dropCell(pPage, idx, cellSizePtr(pPage, pCell));
  6025     rc = balance(pCur, 0);
  6026   }
  6027   if( rc==SQLITE_OK ){
  6028     moveToRoot(pCur);
  6029   }
  6030   return rc;
  6031 }
  6032 
  6033 /*
  6034 ** Create a new BTree table.  Write into *piTable the page
  6035 ** number for the root page of the new table.
  6036 **
  6037 ** The type of type is determined by the flags parameter.  Only the
  6038 ** following values of flags are currently in use.  Other values for
  6039 ** flags might not work:
  6040 **
  6041 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
  6042 **     BTREE_ZERODATA                  Used for SQL indices
  6043 */
  6044 static int btreeCreateTable(Btree *p, int *piTable, int flags){
  6045   BtShared *pBt = p->pBt;
  6046   MemPage *pRoot;
  6047   Pgno pgnoRoot;
  6048   int rc;
  6049 
  6050   assert( sqlite3BtreeHoldsMutex(p) );
  6051   if( pBt->inTransaction!=TRANS_WRITE ){
  6052     /* Must start a transaction first */
  6053     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6054     return rc;
  6055   }
  6056   assert( !pBt->readOnly );
  6057 
  6058 #ifdef SQLITE_OMIT_AUTOVACUUM
  6059   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6060   if( rc ){
  6061     return rc;
  6062   }
  6063 #else
  6064   if( pBt->autoVacuum ){
  6065     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
  6066     MemPage *pPageMove; /* The page to move to. */
  6067 
  6068     /* Creating a new table may probably require moving an existing database
  6069     ** to make room for the new tables root page. In case this page turns
  6070     ** out to be an overflow page, delete all overflow page-map caches
  6071     ** held by open cursors.
  6072     */
  6073     invalidateAllOverflowCache(pBt);
  6074 
  6075     /* Read the value of meta[3] from the database to determine where the
  6076     ** root page of the new table should go. meta[3] is the largest root-page
  6077     ** created so far, so the new root-page is (meta[3]+1).
  6078     */
  6079     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
  6080     if( rc!=SQLITE_OK ){
  6081       return rc;
  6082     }
  6083     pgnoRoot++;
  6084 
  6085     /* The new root-page may not be allocated on a pointer-map page, or the
  6086     ** PENDING_BYTE page.
  6087     */
  6088     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
  6089         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
  6090       pgnoRoot++;
  6091     }
  6092     assert( pgnoRoot>=3 );
  6093 
  6094     /* Allocate a page. The page that currently resides at pgnoRoot will
  6095     ** be moved to the allocated page (unless the allocated page happens
  6096     ** to reside at pgnoRoot).
  6097     */
  6098     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
  6099     if( rc!=SQLITE_OK ){
  6100       return rc;
  6101     }
  6102 
  6103     if( pgnoMove!=pgnoRoot ){
  6104       /* pgnoRoot is the page that will be used for the root-page of
  6105       ** the new table (assuming an error did not occur). But we were
  6106       ** allocated pgnoMove. If required (i.e. if it was not allocated
  6107       ** by extending the file), the current page at position pgnoMove
  6108       ** is already journaled.
  6109       */
  6110       u8 eType;
  6111       Pgno iPtrPage;
  6112 
  6113       releasePage(pPageMove);
  6114 
  6115       /* Move the page currently at pgnoRoot to pgnoMove. */
  6116       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6117       if( rc!=SQLITE_OK ){
  6118         return rc;
  6119       }
  6120       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
  6121       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
  6122         releasePage(pRoot);
  6123         return rc;
  6124       }
  6125       assert( eType!=PTRMAP_ROOTPAGE );
  6126       assert( eType!=PTRMAP_FREEPAGE );
  6127       rc = sqlite3PagerWrite(pRoot->pDbPage);
  6128       if( rc!=SQLITE_OK ){
  6129         releasePage(pRoot);
  6130         return rc;
  6131       }
  6132       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
  6133       releasePage(pRoot);
  6134 
  6135       /* Obtain the page at pgnoRoot */
  6136       if( rc!=SQLITE_OK ){
  6137         return rc;
  6138       }
  6139       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6140       if( rc!=SQLITE_OK ){
  6141         return rc;
  6142       }
  6143       rc = sqlite3PagerWrite(pRoot->pDbPage);
  6144       if( rc!=SQLITE_OK ){
  6145         releasePage(pRoot);
  6146         return rc;
  6147       }
  6148     }else{
  6149       pRoot = pPageMove;
  6150     } 
  6151 
  6152     /* Update the pointer-map and meta-data with the new root-page number. */
  6153     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
  6154     if( rc ){
  6155       releasePage(pRoot);
  6156       return rc;
  6157     }
  6158     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
  6159     if( rc ){
  6160       releasePage(pRoot);
  6161       return rc;
  6162     }
  6163 
  6164   }else{
  6165     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6166     if( rc ) return rc;
  6167   }
  6168 #endif
  6169   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  6170   zeroPage(pRoot, flags | PTF_LEAF);
  6171   sqlite3PagerUnref(pRoot->pDbPage);
  6172   *piTable = (int)pgnoRoot;
  6173   return SQLITE_OK;
  6174 }
  6175 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
  6176   int rc;
  6177   sqlite3BtreeEnter(p);
  6178   p->pBt->db = p->db;
  6179   rc = btreeCreateTable(p, piTable, flags);
  6180   sqlite3BtreeLeave(p);
  6181   return rc;
  6182 }
  6183 
  6184 /*
  6185 ** Erase the given database page and all its children.  Return
  6186 ** the page to the freelist.
  6187 */
  6188 static int clearDatabasePage(
  6189   BtShared *pBt,           /* The BTree that contains the table */
  6190   Pgno pgno,            /* Page number to clear */
  6191   MemPage *pParent,     /* Parent page.  NULL for the root */
  6192   int freePageFlag      /* Deallocate page if true */
  6193 ){
  6194   MemPage *pPage = 0;
  6195   int rc;
  6196   unsigned char *pCell;
  6197   int i;
  6198 
  6199   assert( sqlite3_mutex_held(pBt->mutex) );
  6200   if( pgno>pagerPagecount(pBt->pPager) ){
  6201     return SQLITE_CORRUPT_BKPT;
  6202   }
  6203 
  6204   rc = getAndInitPage(pBt, pgno, &pPage);
  6205   if( rc ) goto cleardatabasepage_out;
  6206   for(i=0; i<pPage->nCell; i++){
  6207     pCell = findCell(pPage, i);
  6208     if( !pPage->leaf ){
  6209       rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
  6210       if( rc ) goto cleardatabasepage_out;
  6211     }
  6212     rc = clearCell(pPage, pCell);
  6213     if( rc ) goto cleardatabasepage_out;
  6214   }
  6215   if( !pPage->leaf ){
  6216     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
  6217     if( rc ) goto cleardatabasepage_out;
  6218   }
  6219   if( freePageFlag ){
  6220     rc = freePage(pPage);
  6221   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
  6222     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
  6223   }
  6224 
  6225 cleardatabasepage_out:
  6226   releasePage(pPage);
  6227   return rc;
  6228 }
  6229 
  6230 /*
  6231 ** Delete all information from a single table in the database.  iTable is
  6232 ** the page number of the root of the table.  After this routine returns,
  6233 ** the root page is empty, but still exists.
  6234 **
  6235 ** This routine will fail with SQLITE_LOCKED if there are any open
  6236 ** read cursors on the table.  Open write cursors are moved to the
  6237 ** root of the table.
  6238 */
  6239 int sqlite3BtreeClearTable(Btree *p, int iTable){
  6240   int rc;
  6241   BtShared *pBt = p->pBt;
  6242   sqlite3BtreeEnter(p);
  6243   pBt->db = p->db;
  6244   if( p->inTrans!=TRANS_WRITE ){
  6245     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6246   }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
  6247     /* nothing to do */
  6248   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
  6249     /* nothing to do */
  6250   }else{
  6251     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
  6252   }
  6253   sqlite3BtreeLeave(p);
  6254   return rc;
  6255 }
  6256 
  6257 /*
  6258 ** Erase all information in a table and add the root of the table to
  6259 ** the freelist.  Except, the root of the principle table (the one on
  6260 ** page 1) is never added to the freelist.
  6261 **
  6262 ** This routine will fail with SQLITE_LOCKED if there are any open
  6263 ** cursors on the table.
  6264 **
  6265 ** If AUTOVACUUM is enabled and the page at iTable is not the last
  6266 ** root page in the database file, then the last root page 
  6267 ** in the database file is moved into the slot formerly occupied by
  6268 ** iTable and that last slot formerly occupied by the last root page
  6269 ** is added to the freelist instead of iTable.  In this say, all
  6270 ** root pages are kept at the beginning of the database file, which
  6271 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
  6272 ** page number that used to be the last root page in the file before
  6273 ** the move.  If no page gets moved, *piMoved is set to 0.
  6274 ** The last root page is recorded in meta[3] and the value of
  6275 ** meta[3] is updated by this procedure.
  6276 */
  6277 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
  6278   int rc;
  6279   MemPage *pPage = 0;
  6280   BtShared *pBt = p->pBt;
  6281 
  6282   assert( sqlite3BtreeHoldsMutex(p) );
  6283   if( p->inTrans!=TRANS_WRITE ){
  6284     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6285   }
  6286 
  6287   /* It is illegal to drop a table if any cursors are open on the
  6288   ** database. This is because in auto-vacuum mode the backend may
  6289   ** need to move another root-page to fill a gap left by the deleted
  6290   ** root page. If an open cursor was using this page a problem would 
  6291   ** occur.
  6292   */
  6293   if( pBt->pCursor ){
  6294     return SQLITE_LOCKED;
  6295   }
  6296 
  6297   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  6298   if( rc ) return rc;
  6299   rc = sqlite3BtreeClearTable(p, iTable);
  6300   if( rc ){
  6301     releasePage(pPage);
  6302     return rc;
  6303   }
  6304 
  6305   *piMoved = 0;
  6306 
  6307   if( iTable>1 ){
  6308 #ifdef SQLITE_OMIT_AUTOVACUUM
  6309     rc = freePage(pPage);
  6310     releasePage(pPage);
  6311 #else
  6312     if( pBt->autoVacuum ){
  6313       Pgno maxRootPgno;
  6314       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
  6315       if( rc!=SQLITE_OK ){
  6316         releasePage(pPage);
  6317         return rc;
  6318       }
  6319 
  6320       if( iTable==maxRootPgno ){
  6321         /* If the table being dropped is the table with the largest root-page
  6322         ** number in the database, put the root page on the free list. 
  6323         */
  6324         rc = freePage(pPage);
  6325         releasePage(pPage);
  6326         if( rc!=SQLITE_OK ){
  6327           return rc;
  6328         }
  6329       }else{
  6330         /* The table being dropped does not have the largest root-page
  6331         ** number in the database. So move the page that does into the 
  6332         ** gap left by the deleted root-page.
  6333         */
  6334         MemPage *pMove;
  6335         releasePage(pPage);
  6336         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
  6337         if( rc!=SQLITE_OK ){
  6338           return rc;
  6339         }
  6340         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
  6341         releasePage(pMove);
  6342         if( rc!=SQLITE_OK ){
  6343           return rc;
  6344         }
  6345         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
  6346         if( rc!=SQLITE_OK ){
  6347           return rc;
  6348         }
  6349         rc = freePage(pMove);
  6350         releasePage(pMove);
  6351         if( rc!=SQLITE_OK ){
  6352           return rc;
  6353         }
  6354         *piMoved = maxRootPgno;
  6355       }
  6356 
  6357       /* Set the new 'max-root-page' value in the database header. This
  6358       ** is the old value less one, less one more if that happens to
  6359       ** be a root-page number, less one again if that is the
  6360       ** PENDING_BYTE_PAGE.
  6361       */
  6362       maxRootPgno--;
  6363       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
  6364         maxRootPgno--;
  6365       }
  6366       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
  6367         maxRootPgno--;
  6368       }
  6369       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
  6370 
  6371       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  6372     }else{
  6373       rc = freePage(pPage);
  6374       releasePage(pPage);
  6375     }
  6376 #endif
  6377   }else{
  6378     /* If sqlite3BtreeDropTable was called on page 1. */
  6379     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
  6380     releasePage(pPage);
  6381   }
  6382   return rc;  
  6383 }
  6384 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  6385   int rc;
  6386   sqlite3BtreeEnter(p);
  6387   p->pBt->db = p->db;
  6388   rc = btreeDropTable(p, iTable, piMoved);
  6389   sqlite3BtreeLeave(p);
  6390   return rc;
  6391 }
  6392 
  6393 
  6394 /*
  6395 ** Read the meta-information out of a database file.  Meta[0]
  6396 ** is the number of free pages currently in the database.  Meta[1]
  6397 ** through meta[15] are available for use by higher layers.  Meta[0]
  6398 ** is read-only, the others are read/write.
  6399 ** 
  6400 ** The schema layer numbers meta values differently.  At the schema
  6401 ** layer (and the SetCookie and ReadCookie opcodes) the number of
  6402 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
  6403 */
  6404 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
  6405   DbPage *pDbPage;
  6406   int rc;
  6407   unsigned char *pP1;
  6408   BtShared *pBt = p->pBt;
  6409 
  6410   sqlite3BtreeEnter(p);
  6411   pBt->db = p->db;
  6412 
  6413   /* Reading a meta-data value requires a read-lock on page 1 (and hence
  6414   ** the sqlite_master table. We grab this lock regardless of whether or
  6415   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
  6416   ** 1 is treated as a special case by queryTableLock() and lockTable()).
  6417   */
  6418   rc = queryTableLock(p, 1, READ_LOCK);
  6419   if( rc!=SQLITE_OK ){
  6420     sqlite3BtreeLeave(p);
  6421     return rc;
  6422   }
  6423 
  6424   assert( idx>=0 && idx<=15 );
  6425   if( pBt->pPage1 ){
  6426     /* The b-tree is already holding a reference to page 1 of the database
  6427     ** file. In this case the required meta-data value can be read directly
  6428     ** from the page data of this reference. This is slightly faster than
  6429     ** requesting a new reference from the pager layer.
  6430     */
  6431     pP1 = (unsigned char *)pBt->pPage1->aData;
  6432   }else{
  6433     /* The b-tree does not have a reference to page 1 of the database file.
  6434     ** Obtain one from the pager layer.
  6435     */
  6436     rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
  6437     if( rc ){
  6438       sqlite3BtreeLeave(p);
  6439       return rc;
  6440     }
  6441     pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
  6442   }
  6443   *pMeta = get4byte(&pP1[36 + idx*4]);
  6444 
  6445   /* If the b-tree is not holding a reference to page 1, then one was 
  6446   ** requested from the pager layer in the above block. Release it now.
  6447   */
  6448   if( !pBt->pPage1 ){
  6449     sqlite3PagerUnref(pDbPage);
  6450   }
  6451 
  6452   /* If autovacuumed is disabled in this build but we are trying to 
  6453   ** access an autovacuumed database, then make the database readonly. 
  6454   */
  6455 #ifdef SQLITE_OMIT_AUTOVACUUM
  6456   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
  6457 #endif
  6458 
  6459   /* Grab the read-lock on page 1. */
  6460   rc = lockTable(p, 1, READ_LOCK);
  6461   sqlite3BtreeLeave(p);
  6462   return rc;
  6463 }
  6464 
  6465 /*
  6466 ** Write meta-information back into the database.  Meta[0] is
  6467 ** read-only and may not be written.
  6468 */
  6469 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
  6470   BtShared *pBt = p->pBt;
  6471   unsigned char *pP1;
  6472   int rc;
  6473   assert( idx>=1 && idx<=15 );
  6474   sqlite3BtreeEnter(p);
  6475   pBt->db = p->db;
  6476   if( p->inTrans!=TRANS_WRITE ){
  6477     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6478   }else{
  6479     assert( pBt->pPage1!=0 );
  6480     pP1 = pBt->pPage1->aData;
  6481     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  6482     if( rc==SQLITE_OK ){
  6483       put4byte(&pP1[36 + idx*4], iMeta);
  6484 #ifndef SQLITE_OMIT_AUTOVACUUM
  6485       if( idx==7 ){
  6486         assert( pBt->autoVacuum || iMeta==0 );
  6487         assert( iMeta==0 || iMeta==1 );
  6488         pBt->incrVacuum = iMeta;
  6489       }
  6490 #endif
  6491     }
  6492   }
  6493   sqlite3BtreeLeave(p);
  6494   return rc;
  6495 }
  6496 
  6497 /*
  6498 ** Return the flag byte at the beginning of the page that the cursor
  6499 ** is currently pointing to.
  6500 */
  6501 int sqlite3BtreeFlags(BtCursor *pCur){
  6502   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
  6503   ** restoreCursorPosition() here.
  6504   */
  6505   MemPage *pPage;
  6506   restoreCursorPosition(pCur);
  6507   pPage = pCur->apPage[pCur->iPage];
  6508   assert( cursorHoldsMutex(pCur) );
  6509   assert( pPage->pBt==pCur->pBt );
  6510   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
  6511 }
  6512 
  6513 
  6514 /*
  6515 ** Return the pager associated with a BTree.  This routine is used for
  6516 ** testing and debugging only.
  6517 */
  6518 Pager *sqlite3BtreePager(Btree *p){
  6519   return p->pBt->pPager;
  6520 }
  6521 
  6522 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6523 /*
  6524 ** Append a message to the error message string.
  6525 */
  6526 static void checkAppendMsg(
  6527   IntegrityCk *pCheck,
  6528   char *zMsg1,
  6529   const char *zFormat,
  6530   ...
  6531 ){
  6532   va_list ap;
  6533   if( !pCheck->mxErr ) return;
  6534   pCheck->mxErr--;
  6535   pCheck->nErr++;
  6536   va_start(ap, zFormat);
  6537   if( pCheck->errMsg.nChar ){
  6538     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
  6539   }
  6540   if( zMsg1 ){
  6541     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
  6542   }
  6543   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
  6544   va_end(ap);
  6545   if( pCheck->errMsg.mallocFailed ){
  6546     pCheck->mallocFailed = 1;
  6547   }
  6548 }
  6549 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6550 
  6551 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6552 /*
  6553 ** Add 1 to the reference count for page iPage.  If this is the second
  6554 ** reference to the page, add an error message to pCheck->zErrMsg.
  6555 ** Return 1 if there are 2 ore more references to the page and 0 if
  6556 ** if this is the first reference to the page.
  6557 **
  6558 ** Also check that the page number is in bounds.
  6559 */
  6560 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
  6561   if( iPage==0 ) return 1;
  6562   if( iPage>pCheck->nPage || iPage<0 ){
  6563     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
  6564     return 1;
  6565   }
  6566   if( pCheck->anRef[iPage]==1 ){
  6567     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
  6568     return 1;
  6569   }
  6570   return  (pCheck->anRef[iPage]++)>1;
  6571 }
  6572 
  6573 #ifndef SQLITE_OMIT_AUTOVACUUM
  6574 /*
  6575 ** Check that the entry in the pointer-map for page iChild maps to 
  6576 ** page iParent, pointer type ptrType. If not, append an error message
  6577 ** to pCheck.
  6578 */
  6579 static void checkPtrmap(
  6580   IntegrityCk *pCheck,   /* Integrity check context */
  6581   Pgno iChild,           /* Child page number */
  6582   u8 eType,              /* Expected pointer map type */
  6583   Pgno iParent,          /* Expected pointer map parent page number */
  6584   char *zContext         /* Context description (used for error msg) */
  6585 ){
  6586   int rc;
  6587   u8 ePtrmapType;
  6588   Pgno iPtrmapParent;
  6589 
  6590   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
  6591   if( rc!=SQLITE_OK ){
  6592     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
  6593     return;
  6594   }
  6595 
  6596   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
  6597     checkAppendMsg(pCheck, zContext, 
  6598       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
  6599       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
  6600   }
  6601 }
  6602 #endif
  6603 
  6604 /*
  6605 ** Check the integrity of the freelist or of an overflow page list.
  6606 ** Verify that the number of pages on the list is N.
  6607 */
  6608 static void checkList(
  6609   IntegrityCk *pCheck,  /* Integrity checking context */
  6610   int isFreeList,       /* True for a freelist.  False for overflow page list */
  6611   int iPage,            /* Page number for first page in the list */
  6612   int N,                /* Expected number of pages in the list */
  6613   char *zContext        /* Context for error messages */
  6614 ){
  6615   int i;
  6616   int expected = N;
  6617   int iFirst = iPage;
  6618   while( N-- > 0 && pCheck->mxErr ){
  6619     DbPage *pOvflPage;
  6620     unsigned char *pOvflData;
  6621     if( iPage<1 ){
  6622       checkAppendMsg(pCheck, zContext,
  6623          "%d of %d pages missing from overflow list starting at %d",
  6624           N+1, expected, iFirst);
  6625       break;
  6626     }
  6627     if( checkRef(pCheck, iPage, zContext) ) break;
  6628     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
  6629       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
  6630       break;
  6631     }
  6632     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
  6633     if( isFreeList ){
  6634       int n = get4byte(&pOvflData[4]);
  6635 #ifndef SQLITE_OMIT_AUTOVACUUM
  6636       if( pCheck->pBt->autoVacuum ){
  6637         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
  6638       }
  6639 #endif
  6640       if( n>pCheck->pBt->usableSize/4-2 ){
  6641         checkAppendMsg(pCheck, zContext,
  6642            "freelist leaf count too big on page %d", iPage);
  6643         N--;
  6644       }else{
  6645         for(i=0; i<n; i++){
  6646           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
  6647 #ifndef SQLITE_OMIT_AUTOVACUUM
  6648           if( pCheck->pBt->autoVacuum ){
  6649             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
  6650           }
  6651 #endif
  6652           checkRef(pCheck, iFreePage, zContext);
  6653         }
  6654         N -= n;
  6655       }
  6656     }
  6657 #ifndef SQLITE_OMIT_AUTOVACUUM
  6658     else{
  6659       /* If this database supports auto-vacuum and iPage is not the last
  6660       ** page in this overflow list, check that the pointer-map entry for
  6661       ** the following page matches iPage.
  6662       */
  6663       if( pCheck->pBt->autoVacuum && N>0 ){
  6664         i = get4byte(pOvflData);
  6665         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
  6666       }
  6667     }
  6668 #endif
  6669     iPage = get4byte(pOvflData);
  6670     sqlite3PagerUnref(pOvflPage);
  6671   }
  6672 }
  6673 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6674 
  6675 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6676 /*
  6677 ** Do various sanity checks on a single page of a tree.  Return
  6678 ** the tree depth.  Root pages return 0.  Parents of root pages
  6679 ** return 1, and so forth.
  6680 ** 
  6681 ** These checks are done:
  6682 **
  6683 **      1.  Make sure that cells and freeblocks do not overlap
  6684 **          but combine to completely cover the page.
  6685 **  NO  2.  Make sure cell keys are in order.
  6686 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
  6687 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
  6688 **      5.  Check the integrity of overflow pages.
  6689 **      6.  Recursively call checkTreePage on all children.
  6690 **      7.  Verify that the depth of all children is the same.
  6691 **      8.  Make sure this page is at least 33% full or else it is
  6692 **          the root of the tree.
  6693 */
  6694 static int checkTreePage(
  6695   IntegrityCk *pCheck,  /* Context for the sanity check */
  6696   int iPage,            /* Page number of the page to check */
  6697   MemPage *pParent,     /* Parent page */
  6698   char *zParentContext  /* Parent context */
  6699 ){
  6700   MemPage *pPage;
  6701   int i, rc, depth, d2, pgno, cnt;
  6702   int hdr, cellStart;
  6703   int nCell;
  6704   u8 *data;
  6705   BtShared *pBt;
  6706   int usableSize;
  6707   char zContext[100];
  6708   char *hit;
  6709 
  6710   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
  6711 
  6712   /* Check that the page exists
  6713   */
  6714   pBt = pCheck->pBt;
  6715   usableSize = pBt->usableSize;
  6716   if( iPage==0 ) return 0;
  6717   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
  6718   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
  6719     checkAppendMsg(pCheck, zContext,
  6720        "unable to get the page. error code=%d", rc);
  6721     return 0;
  6722   }
  6723   if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
  6724     checkAppendMsg(pCheck, zContext, 
  6725                    "sqlite3BtreeInitPage() returns error code %d", rc);
  6726     releasePage(pPage);
  6727     return 0;
  6728   }
  6729 
  6730   /* Check out all the cells.
  6731   */
  6732   depth = 0;
  6733   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
  6734     u8 *pCell;
  6735     int sz;
  6736     CellInfo info;
  6737 
  6738     /* Check payload overflow pages
  6739     */
  6740     sqlite3_snprintf(sizeof(zContext), zContext,
  6741              "On tree page %d cell %d: ", iPage, i);
  6742     pCell = findCell(pPage,i);
  6743     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  6744     sz = info.nData;
  6745     if( !pPage->intKey ) sz += info.nKey;
  6746     assert( sz==info.nPayload );
  6747     if( sz>info.nLocal ){
  6748       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
  6749       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
  6750 #ifndef SQLITE_OMIT_AUTOVACUUM
  6751       if( pBt->autoVacuum ){
  6752         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
  6753       }
  6754 #endif
  6755       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
  6756     }
  6757 
  6758     /* Check sanity of left child page.
  6759     */
  6760     if( !pPage->leaf ){
  6761       pgno = get4byte(pCell);
  6762 #ifndef SQLITE_OMIT_AUTOVACUUM
  6763       if( pBt->autoVacuum ){
  6764         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
  6765       }
  6766 #endif
  6767       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
  6768       if( i>0 && d2!=depth ){
  6769         checkAppendMsg(pCheck, zContext, "Child page depth differs");
  6770       }
  6771       depth = d2;
  6772     }
  6773   }
  6774   if( !pPage->leaf ){
  6775     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  6776     sqlite3_snprintf(sizeof(zContext), zContext, 
  6777                      "On page %d at right child: ", iPage);
  6778 #ifndef SQLITE_OMIT_AUTOVACUUM
  6779     if( pBt->autoVacuum ){
  6780       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
  6781     }
  6782 #endif
  6783     checkTreePage(pCheck, pgno, pPage, zContext);
  6784   }
  6785  
  6786   /* Check for complete coverage of the page
  6787   */
  6788   data = pPage->aData;
  6789   hdr = pPage->hdrOffset;
  6790   hit = sqlite3PageMalloc( pBt->pageSize );
  6791   if( hit==0 ){
  6792     pCheck->mallocFailed = 1;
  6793   }else{
  6794     memset(hit, 0, usableSize );
  6795     memset(hit, 1, get2byte(&data[hdr+5]));
  6796     nCell = get2byte(&data[hdr+3]);
  6797     cellStart = hdr + 12 - 4*pPage->leaf;
  6798     for(i=0; i<nCell; i++){
  6799       int pc = get2byte(&data[cellStart+i*2]);
  6800       u16 size = 1024;
  6801       int j;
  6802       if( pc<=usableSize ){
  6803         size = cellSizePtr(pPage, &data[pc]);
  6804       }
  6805       if( (pc+size-1)>=usableSize || pc<0 ){
  6806         checkAppendMsg(pCheck, 0, 
  6807             "Corruption detected in cell %d on page %d",i,iPage,0);
  6808       }else{
  6809         for(j=pc+size-1; j>=pc; j--) hit[j]++;
  6810       }
  6811     }
  6812     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 
  6813            cnt++){
  6814       int size = get2byte(&data[i+2]);
  6815       int j;
  6816       if( (i+size-1)>=usableSize || i<0 ){
  6817         checkAppendMsg(pCheck, 0,  
  6818             "Corruption detected in cell %d on page %d",i,iPage,0);
  6819       }else{
  6820         for(j=i+size-1; j>=i; j--) hit[j]++;
  6821       }
  6822       i = get2byte(&data[i]);
  6823     }
  6824     for(i=cnt=0; i<usableSize; i++){
  6825       if( hit[i]==0 ){
  6826         cnt++;
  6827       }else if( hit[i]>1 ){
  6828         checkAppendMsg(pCheck, 0,
  6829           "Multiple uses for byte %d of page %d", i, iPage);
  6830         break;
  6831       }
  6832     }
  6833     if( cnt!=data[hdr+7] ){
  6834       checkAppendMsg(pCheck, 0, 
  6835           "Fragmented space is %d byte reported as %d on page %d",
  6836           cnt, data[hdr+7], iPage);
  6837     }
  6838   }
  6839   sqlite3PageFree(hit);
  6840 
  6841   releasePage(pPage);
  6842   return depth+1;
  6843 }
  6844 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6845 
  6846 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6847 /*
  6848 ** This routine does a complete check of the given BTree file.  aRoot[] is
  6849 ** an array of pages numbers were each page number is the root page of
  6850 ** a table.  nRoot is the number of entries in aRoot.
  6851 **
  6852 ** Write the number of error seen in *pnErr.  Except for some memory
  6853 ** allocation errors,  nn error message is held in memory obtained from
  6854 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
  6855 ** returned.
  6856 */
  6857 char *sqlite3BtreeIntegrityCheck(
  6858   Btree *p,     /* The btree to be checked */
  6859   int *aRoot,   /* An array of root pages numbers for individual trees */
  6860   int nRoot,    /* Number of entries in aRoot[] */
  6861   int mxErr,    /* Stop reporting errors after this many */
  6862   int *pnErr    /* Write number of errors seen to this variable */
  6863 ){
  6864   int i;
  6865   int nRef;
  6866   IntegrityCk sCheck;
  6867   BtShared *pBt = p->pBt;
  6868   char zErr[100];
  6869 
  6870   sqlite3BtreeEnter(p);
  6871   pBt->db = p->db;
  6872   nRef = sqlite3PagerRefcount(pBt->pPager);
  6873   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
  6874     *pnErr = 1;
  6875     sqlite3BtreeLeave(p);
  6876     return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
  6877   }
  6878   sCheck.pBt = pBt;
  6879   sCheck.pPager = pBt->pPager;
  6880   sCheck.nPage = pagerPagecount(sCheck.pPager);
  6881   sCheck.mxErr = mxErr;
  6882   sCheck.nErr = 0;
  6883   sCheck.mallocFailed = 0;
  6884   *pnErr = 0;
  6885 #ifndef SQLITE_OMIT_AUTOVACUUM
  6886   if( pBt->nTrunc!=0 ){
  6887     sCheck.nPage = pBt->nTrunc;
  6888   }
  6889 #endif
  6890   if( sCheck.nPage==0 ){
  6891     unlockBtreeIfUnused(pBt);
  6892     sqlite3BtreeLeave(p);
  6893     return 0;
  6894   }
  6895   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
  6896   if( !sCheck.anRef ){
  6897     unlockBtreeIfUnused(pBt);
  6898     *pnErr = 1;
  6899     sqlite3BtreeLeave(p);
  6900     return 0;
  6901   }
  6902   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
  6903   i = PENDING_BYTE_PAGE(pBt);
  6904   if( i<=sCheck.nPage ){
  6905     sCheck.anRef[i] = 1;
  6906   }
  6907   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
  6908 
  6909   /* Check the integrity of the freelist
  6910   */
  6911   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
  6912             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
  6913 
  6914   /* Check all the tables.
  6915   */
  6916   for(i=0; i<nRoot && sCheck.mxErr; i++){
  6917     if( aRoot[i]==0 ) continue;
  6918 #ifndef SQLITE_OMIT_AUTOVACUUM
  6919     if( pBt->autoVacuum && aRoot[i]>1 ){
  6920       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
  6921     }
  6922 #endif
  6923     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
  6924   }
  6925 
  6926   /* Make sure every page in the file is referenced
  6927   */
  6928   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
  6929 #ifdef SQLITE_OMIT_AUTOVACUUM
  6930     if( sCheck.anRef[i]==0 ){
  6931       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  6932     }
  6933 #else
  6934     /* If the database supports auto-vacuum, make sure no tables contain
  6935     ** references to pointer-map pages.
  6936     */
  6937     if( sCheck.anRef[i]==0 && 
  6938        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
  6939       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  6940     }
  6941     if( sCheck.anRef[i]!=0 && 
  6942        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
  6943       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
  6944     }
  6945 #endif
  6946   }
  6947 
  6948   /* Make sure this analysis did not leave any unref() pages
  6949   */
  6950   unlockBtreeIfUnused(pBt);
  6951   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
  6952     checkAppendMsg(&sCheck, 0, 
  6953       "Outstanding page count goes from %d to %d during this analysis",
  6954       nRef, sqlite3PagerRefcount(pBt->pPager)
  6955     );
  6956   }
  6957 
  6958   /* Clean  up and report errors.
  6959   */
  6960   sqlite3BtreeLeave(p);
  6961   sqlite3_free(sCheck.anRef);
  6962   if( sCheck.mallocFailed ){
  6963     sqlite3StrAccumReset(&sCheck.errMsg);
  6964     *pnErr = sCheck.nErr+1;
  6965     return 0;
  6966   }
  6967   *pnErr = sCheck.nErr;
  6968   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  6969   return sqlite3StrAccumFinish(&sCheck.errMsg);
  6970 }
  6971 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6972 
  6973 /*
  6974 ** Return the full pathname of the underlying database file.
  6975 **
  6976 ** The pager filename is invariant as long as the pager is
  6977 ** open so it is safe to access without the BtShared mutex.
  6978 */
  6979 const char *sqlite3BtreeGetFilename(Btree *p){
  6980   assert( p->pBt->pPager!=0 );
  6981   return sqlite3PagerFilename(p->pBt->pPager);
  6982 }
  6983 
  6984 /*
  6985 ** Return the pathname of the directory that contains the database file.
  6986 **
  6987 ** The pager directory name is invariant as long as the pager is
  6988 ** open so it is safe to access without the BtShared mutex.
  6989 */
  6990 const char *sqlite3BtreeGetDirname(Btree *p){
  6991   assert( p->pBt->pPager!=0 );
  6992   return sqlite3PagerDirname(p->pBt->pPager);
  6993 }
  6994 
  6995 /*
  6996 ** Return the pathname of the journal file for this database. The return
  6997 ** value of this routine is the same regardless of whether the journal file
  6998 ** has been created or not.
  6999 **
  7000 ** The pager journal filename is invariant as long as the pager is
  7001 ** open so it is safe to access without the BtShared mutex.
  7002 */
  7003 const char *sqlite3BtreeGetJournalname(Btree *p){
  7004   assert( p->pBt->pPager!=0 );
  7005   return sqlite3PagerJournalname(p->pBt->pPager);
  7006 }
  7007 
  7008 #ifndef SQLITE_OMIT_VACUUM
  7009 /*
  7010 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
  7011 ** must be active for both files.
  7012 **
  7013 ** The size of file pTo may be reduced by this operation.
  7014 ** If anything goes wrong, the transaction on pTo is rolled back. 
  7015 **
  7016 ** If successful, CommitPhaseOne() may be called on pTo before returning. 
  7017 ** The caller should finish committing the transaction on pTo by calling
  7018 ** sqlite3BtreeCommit().
  7019 */
  7020 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
  7021   int rc = SQLITE_OK;
  7022   Pgno i;
  7023 
  7024   Pgno nFromPage;     /* Number of pages in pFrom */
  7025   Pgno nToPage;       /* Number of pages in pTo */
  7026   Pgno nNewPage;      /* Number of pages in pTo after the copy */
  7027 
  7028   Pgno iSkip;         /* Pending byte page in pTo */
  7029   int nToPageSize;    /* Page size of pTo in bytes */
  7030   int nFromPageSize;  /* Page size of pFrom in bytes */
  7031 
  7032   BtShared *pBtTo = pTo->pBt;
  7033   BtShared *pBtFrom = pFrom->pBt;
  7034   pBtTo->db = pTo->db;
  7035   pBtFrom->db = pFrom->db;
  7036 
  7037   nToPageSize = pBtTo->pageSize;
  7038   nFromPageSize = pBtFrom->pageSize;
  7039 
  7040   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
  7041     return SQLITE_ERROR;
  7042   }
  7043   if( pBtTo->pCursor ){
  7044     return SQLITE_BUSY;
  7045   }
  7046 
  7047   nToPage = pagerPagecount(pBtTo->pPager);
  7048   nFromPage = pagerPagecount(pBtFrom->pPager);
  7049   iSkip = PENDING_BYTE_PAGE(pBtTo);
  7050 
  7051   /* Variable nNewPage is the number of pages required to store the
  7052   ** contents of pFrom using the current page-size of pTo.
  7053   */
  7054   nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / 
  7055       (i64)nToPageSize;
  7056 
  7057   for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
  7058 
  7059     /* Journal the original page.
  7060     **
  7061     ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
  7062     ** in database *pTo (before the copy). This page is never written 
  7063     ** into the journal file. Unless i==iSkip or the page was not
  7064     ** present in pTo before the copy operation, journal page i from pTo.
  7065     */
  7066     if( i!=iSkip && i<=nToPage ){
  7067       DbPage *pDbPage = 0;
  7068       rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
  7069       if( rc==SQLITE_OK ){
  7070         rc = sqlite3PagerWrite(pDbPage);
  7071         if( rc==SQLITE_OK && i>nFromPage ){
  7072           /* Yeah.  It seems wierd to call DontWrite() right after Write(). But
  7073           ** that is because the names of those procedures do not exactly 
  7074           ** represent what they do.  Write() really means "put this page in the
  7075           ** rollback journal and mark it as dirty so that it will be written
  7076           ** to the database file later."  DontWrite() undoes the second part of
  7077           ** that and prevents the page from being written to the database. The
  7078           ** page is still on the rollback journal, though.  And that is the 
  7079           ** whole point of this block: to put pages on the rollback journal. 
  7080           */
  7081           rc = sqlite3PagerDontWrite(pDbPage);
  7082         }
  7083         sqlite3PagerUnref(pDbPage);
  7084       }
  7085     }
  7086 
  7087     /* Overwrite the data in page i of the target database */
  7088     if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
  7089 
  7090       DbPage *pToPage = 0;
  7091       sqlite3_int64 iOff;
  7092 
  7093       rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
  7094       if( rc==SQLITE_OK ){
  7095         rc = sqlite3PagerWrite(pToPage);
  7096       }
  7097 
  7098       for(
  7099         iOff=(i-1)*nToPageSize; 
  7100         rc==SQLITE_OK && iOff<i*nToPageSize; 
  7101         iOff += nFromPageSize
  7102       ){
  7103         DbPage *pFromPage = 0;
  7104         Pgno iFrom = (iOff/nFromPageSize)+1;
  7105 
  7106         if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
  7107           continue;
  7108         }
  7109 
  7110         rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
  7111         if( rc==SQLITE_OK ){
  7112           char *zTo = sqlite3PagerGetData(pToPage);
  7113           char *zFrom = sqlite3PagerGetData(pFromPage);
  7114           int nCopy;
  7115 
  7116           if( nFromPageSize>=nToPageSize ){
  7117             zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
  7118             nCopy = nToPageSize;
  7119           }else{
  7120             zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
  7121             nCopy = nFromPageSize;
  7122           }
  7123 
  7124           memcpy(zTo, zFrom, nCopy);
  7125           sqlite3PagerUnref(pFromPage);
  7126         }
  7127       }
  7128 
  7129       if( pToPage ){
  7130         MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
  7131         p->isInit = 0;
  7132         sqlite3PagerUnref(pToPage);
  7133       }
  7134     }
  7135   }
  7136 
  7137   /* If things have worked so far, the database file may need to be 
  7138   ** truncated. The complex part is that it may need to be truncated to
  7139   ** a size that is not an integer multiple of nToPageSize - the current
  7140   ** page size used by the pager associated with B-Tree pTo.
  7141   **
  7142   ** For example, say the page-size of pTo is 2048 bytes and the original 
  7143   ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 
  7144   ** bytes and 9 pages, then the file needs to be truncated to 9KB.
  7145   */
  7146   if( rc==SQLITE_OK ){
  7147     if( nFromPageSize!=nToPageSize ){
  7148       sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
  7149       i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
  7150       i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; 
  7151       i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
  7152   
  7153       assert( iSize<=iNow );
  7154   
  7155       /* Commit phase one syncs the journal file associated with pTo 
  7156       ** containing the original data. It does not sync the database file
  7157       ** itself. After doing this it is safe to use OsTruncate() and other
  7158       ** file APIs on the database file directly.
  7159       */
  7160       pBtTo->db = pTo->db;
  7161       rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
  7162       if( iSize<iNow && rc==SQLITE_OK ){
  7163         rc = sqlite3OsTruncate(pFile, iSize);
  7164       }
  7165   
  7166       /* The loop that copied data from database pFrom to pTo did not
  7167       ** populate the locking page of database pTo. If the page-size of
  7168       ** pFrom is smaller than that of pTo, this means some data will
  7169       ** not have been copied. 
  7170       **
  7171       ** This block copies the missing data from database pFrom to pTo 
  7172       ** using file APIs. This is safe because at this point we know that
  7173       ** all of the original data from pTo has been synced into the 
  7174       ** journal file. At this point it would be safe to do anything at
  7175       ** all to the database file except truncate it to zero bytes.
  7176       */
  7177       if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
  7178         i64 iOff;
  7179         for(
  7180           iOff=iPending; 
  7181           rc==SQLITE_OK && iOff<(iPending+nToPageSize); 
  7182           iOff += nFromPageSize
  7183         ){
  7184           DbPage *pFromPage = 0;
  7185           Pgno iFrom = (iOff/nFromPageSize)+1;
  7186   
  7187           if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
  7188             continue;
  7189           }
  7190   
  7191           rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
  7192           if( rc==SQLITE_OK ){
  7193             char *zFrom = sqlite3PagerGetData(pFromPage);
  7194             rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
  7195             sqlite3PagerUnref(pFromPage);
  7196           }
  7197         }
  7198       }
  7199   
  7200       /* Sync the database file */
  7201       if( rc==SQLITE_OK ){
  7202         rc = sqlite3PagerSync(pBtTo->pPager);
  7203       }
  7204     }else{
  7205       rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
  7206     }
  7207     if( rc==SQLITE_OK ){
  7208       pBtTo->pageSizeFixed = 0;
  7209     }
  7210   }
  7211 
  7212   if( rc ){
  7213     sqlite3BtreeRollback(pTo);
  7214   }
  7215 
  7216   return rc;  
  7217 }
  7218 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
  7219   int rc;
  7220   sqlite3BtreeEnter(pTo);
  7221   sqlite3BtreeEnter(pFrom);
  7222   rc = btreeCopyFile(pTo, pFrom);
  7223   sqlite3BtreeLeave(pFrom);
  7224   sqlite3BtreeLeave(pTo);
  7225   return rc;
  7226 }
  7227 
  7228 #endif /* SQLITE_OMIT_VACUUM */
  7229 
  7230 /*
  7231 ** Return non-zero if a transaction is active.
  7232 */
  7233 int sqlite3BtreeIsInTrans(Btree *p){
  7234   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
  7235   return (p && (p->inTrans==TRANS_WRITE));
  7236 }
  7237 
  7238 /*
  7239 ** Return non-zero if a statement transaction is active.
  7240 */
  7241 int sqlite3BtreeIsInStmt(Btree *p){
  7242   assert( sqlite3BtreeHoldsMutex(p) );
  7243   return (p->pBt && p->pBt->inStmt);
  7244 }
  7245 
  7246 /*
  7247 ** Return non-zero if a read (or write) transaction is active.
  7248 */
  7249 int sqlite3BtreeIsInReadTrans(Btree *p){
  7250   assert( sqlite3_mutex_held(p->db->mutex) );
  7251   return (p && (p->inTrans!=TRANS_NONE));
  7252 }
  7253 
  7254 /*
  7255 ** This function returns a pointer to a blob of memory associated with
  7256 ** a single shared-btree. The memory is used by client code for its own
  7257 ** purposes (for example, to store a high-level schema associated with 
  7258 ** the shared-btree). The btree layer manages reference counting issues.
  7259 **
  7260 ** The first time this is called on a shared-btree, nBytes bytes of memory
  7261 ** are allocated, zeroed, and returned to the caller. For each subsequent 
  7262 ** call the nBytes parameter is ignored and a pointer to the same blob
  7263 ** of memory returned. 
  7264 **
  7265 ** If the nBytes parameter is 0 and the blob of memory has not yet been
  7266 ** allocated, a null pointer is returned. If the blob has already been
  7267 ** allocated, it is returned as normal.
  7268 **
  7269 ** Just before the shared-btree is closed, the function passed as the 
  7270 ** xFree argument when the memory allocation was made is invoked on the 
  7271 ** blob of allocated memory. This function should not call sqlite3_free()
  7272 ** on the memory, the btree layer does that.
  7273 */
  7274 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
  7275   BtShared *pBt = p->pBt;
  7276   sqlite3BtreeEnter(p);
  7277   if( !pBt->pSchema && nBytes ){
  7278     pBt->pSchema = sqlite3MallocZero(nBytes);
  7279     pBt->xFreeSchema = xFree;
  7280   }
  7281   sqlite3BtreeLeave(p);
  7282   return pBt->pSchema;
  7283 }
  7284 
  7285 /*
  7286 ** Return true if another user of the same shared btree as the argument
  7287 ** handle holds an exclusive lock on the sqlite_master table.
  7288 */
  7289 int sqlite3BtreeSchemaLocked(Btree *p){
  7290   int rc;
  7291   assert( sqlite3_mutex_held(p->db->mutex) );
  7292   sqlite3BtreeEnter(p);
  7293   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
  7294   sqlite3BtreeLeave(p);
  7295   return rc;
  7296 }
  7297 
  7298 
  7299 #ifndef SQLITE_OMIT_SHARED_CACHE
  7300 /*
  7301 ** Obtain a lock on the table whose root page is iTab.  The
  7302 ** lock is a write lock if isWritelock is true or a read lock
  7303 ** if it is false.
  7304 */
  7305 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
  7306   int rc = SQLITE_OK;
  7307   if( p->sharable ){
  7308     u8 lockType = READ_LOCK + isWriteLock;
  7309     assert( READ_LOCK+1==WRITE_LOCK );
  7310     assert( isWriteLock==0 || isWriteLock==1 );
  7311     sqlite3BtreeEnter(p);
  7312     rc = queryTableLock(p, iTab, lockType);
  7313     if( rc==SQLITE_OK ){
  7314       rc = lockTable(p, iTab, lockType);
  7315     }
  7316     sqlite3BtreeLeave(p);
  7317   }
  7318   return rc;
  7319 }
  7320 #endif
  7321 
  7322 #ifndef SQLITE_OMIT_INCRBLOB
  7323 /*
  7324 ** Argument pCsr must be a cursor opened for writing on an 
  7325 ** INTKEY table currently pointing at a valid table entry. 
  7326 ** This function modifies the data stored as part of that entry.
  7327 ** Only the data content may only be modified, it is not possible
  7328 ** to change the length of the data stored.
  7329 */
  7330 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
  7331   assert( cursorHoldsMutex(pCsr) );
  7332   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
  7333   assert(pCsr->isIncrblobHandle);
  7334 
  7335   restoreCursorPosition(pCsr);
  7336   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
  7337   if( pCsr->eState!=CURSOR_VALID ){
  7338     return SQLITE_ABORT;
  7339   }
  7340 
  7341   /* Check some preconditions: 
  7342   **   (a) the cursor is open for writing,
  7343   **   (b) there is no read-lock on the table being modified and
  7344   **   (c) the cursor points at a valid row of an intKey table.
  7345   */
  7346   if( !pCsr->wrFlag ){
  7347     return SQLITE_READONLY;
  7348   }
  7349   assert( !pCsr->pBt->readOnly 
  7350           && pCsr->pBt->inTransaction==TRANS_WRITE );
  7351   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
  7352     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  7353   }
  7354   if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
  7355     return SQLITE_ERROR;
  7356   }
  7357 
  7358   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
  7359 }
  7360 
  7361 /* 
  7362 ** Set a flag on this cursor to cache the locations of pages from the 
  7363 ** overflow list for the current row. This is used by cursors opened
  7364 ** for incremental blob IO only.
  7365 **
  7366 ** This function sets a flag only. The actual page location cache
  7367 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
  7368 ** accessPayload() (the worker function for sqlite3BtreeData() and
  7369 ** sqlite3BtreePutData()).
  7370 */
  7371 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
  7372   assert( cursorHoldsMutex(pCur) );
  7373   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  7374   assert(!pCur->isIncrblobHandle);
  7375   assert(!pCur->aOverflow);
  7376   pCur->isIncrblobHandle = 1;
  7377 }
  7378 #endif