os/persistentdata/persistentstorage/sql/SQLite364/btree.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ** 2004 April 6
     3 **
     4 ** The author disclaims copyright to this source code.  In place of
     5 ** a legal notice, here is a blessing:
     6 **
     7 **    May you do good and not evil.
     8 **    May you find forgiveness for yourself and forgive others.
     9 **    May you share freely, never taking more than you give.
    10 **
    11 *************************************************************************
    12 ** $Id: btree.c,v 1.525 2008/10/08 17:58:49 danielk1977 Exp $
    13 **
    14 ** This file implements a external (disk-based) database using BTrees.
    15 ** See the header comment on "btreeInt.h" for additional information.
    16 ** Including a description of file format and an overview of operation.
    17 */
    18 #include "btreeInt.h"
    19 
    20 /*
    21 ** The header string that appears at the beginning of every
    22 ** SQLite database.
    23 */
    24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
    25 
    26 /*
    27 ** Set this global variable to 1 to enable tracing using the TRACE
    28 ** macro.
    29 */
    30 #if 0
    31 int sqlite3BtreeTrace=0;  /* True to enable tracing */
    32 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
    33 #else
    34 # define TRACE(X)
    35 #endif
    36 
    37 /*
    38 ** Sometimes we need a small amount of code such as a variable initialization
    39 ** to setup for a later assert() statement.  We do not want this code to
    40 ** appear when assert() is disabled.  The following macro is therefore
    41 ** used to contain that setup code.  The "VVA" acronym stands for
    42 ** "Verification, Validation, and Accreditation".  In other words, the
    43 ** code within VVA_ONLY() will only run during verification processes.
    44 */
    45 #ifndef NDEBUG
    46 # define VVA_ONLY(X)  X
    47 #else
    48 # define VVA_ONLY(X)
    49 #endif
    50 
    51 
    52 
    53 #ifndef SQLITE_OMIT_SHARED_CACHE
    54 /*
    55 ** A list of BtShared objects that are eligible for participation
    56 ** in shared cache.  This variable has file scope during normal builds,
    57 ** but the test harness needs to access it so we make it global for 
    58 ** test builds.
    59 */
    60 #ifdef SQLITE_TEST
    61 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    62 #else
    63 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    64 #endif
    65 #endif /* SQLITE_OMIT_SHARED_CACHE */
    66 
    67 #ifndef SQLITE_OMIT_SHARED_CACHE
    68 /*
    69 ** Enable or disable the shared pager and schema features.
    70 **
    71 ** This routine has no effect on existing database connections.
    72 ** The shared cache setting effects only future calls to
    73 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
    74 */
    75 int sqlite3_enable_shared_cache(int enable){
    76   sqlite3GlobalConfig.sharedCacheEnabled = enable;
    77   return SQLITE_OK;
    78 }
    79 #endif
    80 
    81 
    82 /*
    83 ** Forward declaration
    84 */
    85 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
    86 
    87 
    88 #ifdef SQLITE_OMIT_SHARED_CACHE
    89   /*
    90   ** The functions queryTableLock(), lockTable() and unlockAllTables()
    91   ** manipulate entries in the BtShared.pLock linked list used to store
    92   ** shared-cache table level locks. If the library is compiled with the
    93   ** shared-cache feature disabled, then there is only ever one user
    94   ** of each BtShared structure and so this locking is not necessary. 
    95   ** So define the lock related functions as no-ops.
    96   */
    97   #define queryTableLock(a,b,c) SQLITE_OK
    98   #define lockTable(a,b,c) SQLITE_OK
    99   #define unlockAllTables(a)
   100 #endif
   101 
   102 #ifndef SQLITE_OMIT_SHARED_CACHE
   103 /*
   104 ** Query to see if btree handle p may obtain a lock of type eLock 
   105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
   106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
   107 ** SQLITE_LOCKED if not.
   108 */
   109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
   110   BtShared *pBt = p->pBt;
   111   BtLock *pIter;
   112 
   113   assert( sqlite3BtreeHoldsMutex(p) );
   114   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   115   assert( p->db!=0 );
   116   
   117   /* This is a no-op if the shared-cache is not enabled */
   118   if( !p->sharable ){
   119     return SQLITE_OK;
   120   }
   121 
   122   /* If some other connection is holding an exclusive lock, the
   123   ** requested lock may not be obtained.
   124   */
   125   if( pBt->pExclusive && pBt->pExclusive!=p ){
   126     return SQLITE_LOCKED;
   127   }
   128 
   129   /* This (along with lockTable()) is where the ReadUncommitted flag is
   130   ** dealt with. If the caller is querying for a read-lock and the flag is
   131   ** set, it is unconditionally granted - even if there are write-locks
   132   ** on the table. If a write-lock is requested, the ReadUncommitted flag
   133   ** is not considered.
   134   **
   135   ** In function lockTable(), if a read-lock is demanded and the 
   136   ** ReadUncommitted flag is set, no entry is added to the locks list 
   137   ** (BtShared.pLock).
   138   **
   139   ** To summarize: If the ReadUncommitted flag is set, then read cursors do
   140   ** not create or respect table locks. The locking procedure for a 
   141   ** write-cursor does not change.
   142   */
   143   if( 
   144     0==(p->db->flags&SQLITE_ReadUncommitted) || 
   145     eLock==WRITE_LOCK ||
   146     iTab==MASTER_ROOT
   147   ){
   148     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   149       if( pIter->pBtree!=p && pIter->iTable==iTab && 
   150           (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
   151         return SQLITE_LOCKED;
   152       }
   153     }
   154   }
   155   return SQLITE_OK;
   156 }
   157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
   158 
   159 #ifndef SQLITE_OMIT_SHARED_CACHE
   160 /*
   161 ** Add a lock on the table with root-page iTable to the shared-btree used
   162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
   163 ** WRITE_LOCK.
   164 **
   165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
   166 ** SQLITE_NOMEM may also be returned.
   167 */
   168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
   169   BtShared *pBt = p->pBt;
   170   BtLock *pLock = 0;
   171   BtLock *pIter;
   172 
   173   assert( sqlite3BtreeHoldsMutex(p) );
   174   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   175   assert( p->db!=0 );
   176 
   177   /* This is a no-op if the shared-cache is not enabled */
   178   if( !p->sharable ){
   179     return SQLITE_OK;
   180   }
   181 
   182   assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
   183 
   184   /* If the read-uncommitted flag is set and a read-lock is requested,
   185   ** return early without adding an entry to the BtShared.pLock list. See
   186   ** comment in function queryTableLock() for more info on handling 
   187   ** the ReadUncommitted flag.
   188   */
   189   if( 
   190     (p->db->flags&SQLITE_ReadUncommitted) && 
   191     (eLock==READ_LOCK) &&
   192     iTable!=MASTER_ROOT
   193   ){
   194     return SQLITE_OK;
   195   }
   196 
   197   /* First search the list for an existing lock on this table. */
   198   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   199     if( pIter->iTable==iTable && pIter->pBtree==p ){
   200       pLock = pIter;
   201       break;
   202     }
   203   }
   204 
   205   /* If the above search did not find a BtLock struct associating Btree p
   206   ** with table iTable, allocate one and link it into the list.
   207   */
   208   if( !pLock ){
   209     pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
   210     if( !pLock ){
   211       return SQLITE_NOMEM;
   212     }
   213     pLock->iTable = iTable;
   214     pLock->pBtree = p;
   215     pLock->pNext = pBt->pLock;
   216     pBt->pLock = pLock;
   217   }
   218 
   219   /* Set the BtLock.eLock variable to the maximum of the current lock
   220   ** and the requested lock. This means if a write-lock was already held
   221   ** and a read-lock requested, we don't incorrectly downgrade the lock.
   222   */
   223   assert( WRITE_LOCK>READ_LOCK );
   224   if( eLock>pLock->eLock ){
   225     pLock->eLock = eLock;
   226   }
   227 
   228   return SQLITE_OK;
   229 }
   230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
   231 
   232 #ifndef SQLITE_OMIT_SHARED_CACHE
   233 /*
   234 ** Release all the table locks (locks obtained via calls to the lockTable()
   235 ** procedure) held by Btree handle p.
   236 */
   237 static void unlockAllTables(Btree *p){
   238   BtShared *pBt = p->pBt;
   239   BtLock **ppIter = &pBt->pLock;
   240 
   241   assert( sqlite3BtreeHoldsMutex(p) );
   242   assert( p->sharable || 0==*ppIter );
   243 
   244   while( *ppIter ){
   245     BtLock *pLock = *ppIter;
   246     assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
   247     if( pLock->pBtree==p ){
   248       *ppIter = pLock->pNext;
   249       sqlite3_free(pLock);
   250     }else{
   251       ppIter = &pLock->pNext;
   252     }
   253   }
   254 
   255   if( pBt->pExclusive==p ){
   256     pBt->pExclusive = 0;
   257   }
   258 }
   259 #endif /* SQLITE_OMIT_SHARED_CACHE */
   260 
   261 static void releasePage(MemPage *pPage);  /* Forward reference */
   262 
   263 /*
   264 ** Verify that the cursor holds a mutex on the BtShared
   265 */
   266 #ifndef NDEBUG
   267 static int cursorHoldsMutex(BtCursor *p){
   268   return sqlite3_mutex_held(p->pBt->mutex);
   269 }
   270 #endif
   271 
   272 
   273 #ifndef SQLITE_OMIT_INCRBLOB
   274 /*
   275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
   276 */
   277 static void invalidateOverflowCache(BtCursor *pCur){
   278   assert( cursorHoldsMutex(pCur) );
   279   sqlite3_free(pCur->aOverflow);
   280   pCur->aOverflow = 0;
   281 }
   282 
   283 /*
   284 ** Invalidate the overflow page-list cache for all cursors opened
   285 ** on the shared btree structure pBt.
   286 */
   287 static void invalidateAllOverflowCache(BtShared *pBt){
   288   BtCursor *p;
   289   assert( sqlite3_mutex_held(pBt->mutex) );
   290   for(p=pBt->pCursor; p; p=p->pNext){
   291     invalidateOverflowCache(p);
   292   }
   293 }
   294 #else
   295   #define invalidateOverflowCache(x)
   296   #define invalidateAllOverflowCache(x)
   297 #endif
   298 
   299 /*
   300 ** Save the current cursor position in the variables BtCursor.nKey 
   301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
   302 */
   303 static int saveCursorPosition(BtCursor *pCur){
   304   int rc;
   305 
   306   assert( CURSOR_VALID==pCur->eState );
   307   assert( 0==pCur->pKey );
   308   assert( cursorHoldsMutex(pCur) );
   309 
   310   rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
   311 
   312   /* If this is an intKey table, then the above call to BtreeKeySize()
   313   ** stores the integer key in pCur->nKey. In this case this value is
   314   ** all that is required. Otherwise, if pCur is not open on an intKey
   315   ** table, then malloc space for and store the pCur->nKey bytes of key 
   316   ** data.
   317   */
   318   if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
   319     void *pKey = sqlite3Malloc(pCur->nKey);
   320     if( pKey ){
   321       rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
   322       if( rc==SQLITE_OK ){
   323         pCur->pKey = pKey;
   324       }else{
   325         sqlite3_free(pKey);
   326       }
   327     }else{
   328       rc = SQLITE_NOMEM;
   329     }
   330   }
   331   assert( !pCur->apPage[0]->intKey || !pCur->pKey );
   332 
   333   if( rc==SQLITE_OK ){
   334     int i;
   335     for(i=0; i<=pCur->iPage; i++){
   336       releasePage(pCur->apPage[i]);
   337       pCur->apPage[i] = 0;
   338     }
   339     pCur->iPage = -1;
   340     pCur->eState = CURSOR_REQUIRESEEK;
   341   }
   342 
   343   invalidateOverflowCache(pCur);
   344   return rc;
   345 }
   346 
   347 /*
   348 ** Save the positions of all cursors except pExcept open on the table 
   349 ** with root-page iRoot. Usually, this is called just before cursor
   350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
   351 */
   352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
   353   BtCursor *p;
   354   assert( sqlite3_mutex_held(pBt->mutex) );
   355   assert( pExcept==0 || pExcept->pBt==pBt );
   356   for(p=pBt->pCursor; p; p=p->pNext){
   357     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 
   358         p->eState==CURSOR_VALID ){
   359       int rc = saveCursorPosition(p);
   360       if( SQLITE_OK!=rc ){
   361         return rc;
   362       }
   363     }
   364   }
   365   return SQLITE_OK;
   366 }
   367 
   368 /*
   369 ** Clear the current cursor position.
   370 */
   371 void sqlite3BtreeClearCursor(BtCursor *pCur){
   372   assert( cursorHoldsMutex(pCur) );
   373   sqlite3_free(pCur->pKey);
   374   pCur->pKey = 0;
   375   pCur->eState = CURSOR_INVALID;
   376 }
   377 
   378 /*
   379 ** Restore the cursor to the position it was in (or as close to as possible)
   380 ** when saveCursorPosition() was called. Note that this call deletes the 
   381 ** saved position info stored by saveCursorPosition(), so there can be
   382 ** at most one effective restoreCursorPosition() call after each 
   383 ** saveCursorPosition().
   384 */
   385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
   386   int rc;
   387   assert( cursorHoldsMutex(pCur) );
   388   assert( pCur->eState>=CURSOR_REQUIRESEEK );
   389   if( pCur->eState==CURSOR_FAULT ){
   390     return pCur->skip;
   391   }
   392   pCur->eState = CURSOR_INVALID;
   393   rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
   394   if( rc==SQLITE_OK ){
   395     sqlite3_free(pCur->pKey);
   396     pCur->pKey = 0;
   397     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
   398   }
   399   return rc;
   400 }
   401 
   402 #define restoreCursorPosition(p) \
   403   (p->eState>=CURSOR_REQUIRESEEK ? \
   404          sqlite3BtreeRestoreCursorPosition(p) : \
   405          SQLITE_OK)
   406 
   407 /*
   408 ** Determine whether or not a cursor has moved from the position it
   409 ** was last placed at.  Cursor can move when the row they are pointing
   410 ** at is deleted out from under them.
   411 **
   412 ** This routine returns an error code if something goes wrong.  The
   413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
   414 */
   415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
   416   int rc;
   417 
   418   rc = restoreCursorPosition(pCur);
   419   if( rc ){
   420     *pHasMoved = 1;
   421     return rc;
   422   }
   423   if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
   424     *pHasMoved = 1;
   425   }else{
   426     *pHasMoved = 0;
   427   }
   428   return SQLITE_OK;
   429 }
   430 
   431 #ifndef SQLITE_OMIT_AUTOVACUUM
   432 /*
   433 ** Given a page number of a regular database page, return the page
   434 ** number for the pointer-map page that contains the entry for the
   435 ** input page number.
   436 */
   437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
   438   int nPagesPerMapPage, iPtrMap, ret;
   439   assert( sqlite3_mutex_held(pBt->mutex) );
   440   nPagesPerMapPage = (pBt->usableSize/5)+1;
   441   iPtrMap = (pgno-2)/nPagesPerMapPage;
   442   ret = (iPtrMap*nPagesPerMapPage) + 2; 
   443   if( ret==PENDING_BYTE_PAGE(pBt) ){
   444     ret++;
   445   }
   446   return ret;
   447 }
   448 
   449 /*
   450 ** Write an entry into the pointer map.
   451 **
   452 ** This routine updates the pointer map entry for page number 'key'
   453 ** so that it maps to type 'eType' and parent page number 'pgno'.
   454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
   455 */
   456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
   457   DbPage *pDbPage;  /* The pointer map page */
   458   u8 *pPtrmap;      /* The pointer map data */
   459   Pgno iPtrmap;     /* The pointer map page number */
   460   int offset;       /* Offset in pointer map page */
   461   int rc;
   462 
   463   assert( sqlite3_mutex_held(pBt->mutex) );
   464   /* The master-journal page number must never be used as a pointer map page */
   465   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
   466 
   467   assert( pBt->autoVacuum );
   468   if( key==0 ){
   469     return SQLITE_CORRUPT_BKPT;
   470   }
   471   iPtrmap = PTRMAP_PAGENO(pBt, key);
   472   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   473   if( rc!=SQLITE_OK ){
   474     return rc;
   475   }
   476   offset = PTRMAP_PTROFFSET(iPtrmap, key);
   477   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   478 
   479   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
   480     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
   481     rc = sqlite3PagerWrite(pDbPage);
   482     if( rc==SQLITE_OK ){
   483       pPtrmap[offset] = eType;
   484       put4byte(&pPtrmap[offset+1], parent);
   485     }
   486   }
   487 
   488   sqlite3PagerUnref(pDbPage);
   489   return rc;
   490 }
   491 
   492 /*
   493 ** Read an entry from the pointer map.
   494 **
   495 ** This routine retrieves the pointer map entry for page 'key', writing
   496 ** the type and parent page number to *pEType and *pPgno respectively.
   497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
   498 */
   499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
   500   DbPage *pDbPage;   /* The pointer map page */
   501   int iPtrmap;       /* Pointer map page index */
   502   u8 *pPtrmap;       /* Pointer map page data */
   503   int offset;        /* Offset of entry in pointer map */
   504   int rc;
   505 
   506   assert( sqlite3_mutex_held(pBt->mutex) );
   507 
   508   iPtrmap = PTRMAP_PAGENO(pBt, key);
   509   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   510   if( rc!=0 ){
   511     return rc;
   512   }
   513   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   514 
   515   offset = PTRMAP_PTROFFSET(iPtrmap, key);
   516   assert( pEType!=0 );
   517   *pEType = pPtrmap[offset];
   518   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
   519 
   520   sqlite3PagerUnref(pDbPage);
   521   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
   522   return SQLITE_OK;
   523 }
   524 
   525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
   526   #define ptrmapPut(w,x,y,z) SQLITE_OK
   527   #define ptrmapGet(w,x,y,z) SQLITE_OK
   528   #define ptrmapPutOvfl(y,z) SQLITE_OK
   529 #endif
   530 
   531 /*
   532 ** Given a btree page and a cell index (0 means the first cell on
   533 ** the page, 1 means the second cell, and so forth) return a pointer
   534 ** to the cell content.
   535 **
   536 ** This routine works only for pages that do not contain overflow cells.
   537 */
   538 #define findCell(P,I) \
   539   ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
   540 
   541 /*
   542 ** This a more complex version of findCell() that works for
   543 ** pages that do contain overflow cells.  See insert
   544 */
   545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
   546   int i;
   547   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   548   for(i=pPage->nOverflow-1; i>=0; i--){
   549     int k;
   550     struct _OvflCell *pOvfl;
   551     pOvfl = &pPage->aOvfl[i];
   552     k = pOvfl->idx;
   553     if( k<=iCell ){
   554       if( k==iCell ){
   555         return pOvfl->pCell;
   556       }
   557       iCell--;
   558     }
   559   }
   560   return findCell(pPage, iCell);
   561 }
   562 
   563 /*
   564 ** Parse a cell content block and fill in the CellInfo structure.  There
   565 ** are two versions of this function.  sqlite3BtreeParseCell() takes a 
   566 ** cell index as the second argument and sqlite3BtreeParseCellPtr() 
   567 ** takes a pointer to the body of the cell as its second argument.
   568 **
   569 ** Within this file, the parseCell() macro can be called instead of
   570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
   571 */
   572 void sqlite3BtreeParseCellPtr(
   573   MemPage *pPage,         /* Page containing the cell */
   574   u8 *pCell,              /* Pointer to the cell text. */
   575   CellInfo *pInfo         /* Fill in this structure */
   576 ){
   577   int n;                  /* Number bytes in cell content header */
   578   u32 nPayload;           /* Number of bytes of cell payload */
   579 
   580   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   581 
   582   pInfo->pCell = pCell;
   583   assert( pPage->leaf==0 || pPage->leaf==1 );
   584   n = pPage->childPtrSize;
   585   assert( n==4-4*pPage->leaf );
   586   if( pPage->intKey ){
   587     if( pPage->hasData ){
   588       n += getVarint32(&pCell[n], nPayload);
   589     }else{
   590       nPayload = 0;
   591     }
   592     n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
   593     pInfo->nData = nPayload;
   594   }else{
   595     pInfo->nData = 0;
   596     n += getVarint32(&pCell[n], nPayload);
   597     pInfo->nKey = nPayload;
   598   }
   599   pInfo->nPayload = nPayload;
   600   pInfo->nHeader = n;
   601   if( likely(nPayload<=pPage->maxLocal) ){
   602     /* This is the (easy) common case where the entire payload fits
   603     ** on the local page.  No overflow is required.
   604     */
   605     int nSize;          /* Total size of cell content in bytes */
   606     nSize = nPayload + n;
   607     pInfo->nLocal = nPayload;
   608     pInfo->iOverflow = 0;
   609     if( (nSize & ~3)==0 ){
   610       nSize = 4;        /* Minimum cell size is 4 */
   611     }
   612     pInfo->nSize = nSize;
   613   }else{
   614     /* If the payload will not fit completely on the local page, we have
   615     ** to decide how much to store locally and how much to spill onto
   616     ** overflow pages.  The strategy is to minimize the amount of unused
   617     ** space on overflow pages while keeping the amount of local storage
   618     ** in between minLocal and maxLocal.
   619     **
   620     ** Warning:  changing the way overflow payload is distributed in any
   621     ** way will result in an incompatible file format.
   622     */
   623     int minLocal;  /* Minimum amount of payload held locally */
   624     int maxLocal;  /* Maximum amount of payload held locally */
   625     int surplus;   /* Overflow payload available for local storage */
   626 
   627     minLocal = pPage->minLocal;
   628     maxLocal = pPage->maxLocal;
   629     surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
   630     if( surplus <= maxLocal ){
   631       pInfo->nLocal = surplus;
   632     }else{
   633       pInfo->nLocal = minLocal;
   634     }
   635     pInfo->iOverflow = pInfo->nLocal + n;
   636     pInfo->nSize = pInfo->iOverflow + 4;
   637   }
   638 }
   639 #define parseCell(pPage, iCell, pInfo) \
   640   sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
   641 void sqlite3BtreeParseCell(
   642   MemPage *pPage,         /* Page containing the cell */
   643   int iCell,              /* The cell index.  First cell is 0 */
   644   CellInfo *pInfo         /* Fill in this structure */
   645 ){
   646   parseCell(pPage, iCell, pInfo);
   647 }
   648 
   649 /*
   650 ** Compute the total number of bytes that a Cell needs in the cell
   651 ** data area of the btree-page.  The return number includes the cell
   652 ** data header and the local payload, but not any overflow page or
   653 ** the space used by the cell pointer.
   654 */
   655 #ifndef NDEBUG
   656 static u16 cellSize(MemPage *pPage, int iCell){
   657   CellInfo info;
   658   sqlite3BtreeParseCell(pPage, iCell, &info);
   659   return info.nSize;
   660 }
   661 #endif
   662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
   663   CellInfo info;
   664   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
   665   return info.nSize;
   666 }
   667 
   668 #ifndef SQLITE_OMIT_AUTOVACUUM
   669 /*
   670 ** If the cell pCell, part of page pPage contains a pointer
   671 ** to an overflow page, insert an entry into the pointer-map
   672 ** for the overflow page.
   673 */
   674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
   675   CellInfo info;
   676   assert( pCell!=0 );
   677   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
   678   assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
   679   if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
   680     Pgno ovfl = get4byte(&pCell[info.iOverflow]);
   681     return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
   682   }
   683   return SQLITE_OK;
   684 }
   685 /*
   686 ** If the cell with index iCell on page pPage contains a pointer
   687 ** to an overflow page, insert an entry into the pointer-map
   688 ** for the overflow page.
   689 */
   690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
   691   u8 *pCell;
   692   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   693   pCell = findOverflowCell(pPage, iCell);
   694   return ptrmapPutOvflPtr(pPage, pCell);
   695 }
   696 #endif
   697 
   698 
   699 /*
   700 ** Defragment the page given.  All Cells are moved to the
   701 ** end of the page and all free space is collected into one
   702 ** big FreeBlk that occurs in between the header and cell
   703 ** pointer array and the cell content area.
   704 */
   705 static int defragmentPage(MemPage *pPage){
   706   int i;                     /* Loop counter */
   707   int pc;                    /* Address of a i-th cell */
   708   int addr;                  /* Offset of first byte after cell pointer array */
   709   int hdr;                   /* Offset to the page header */
   710   int size;                  /* Size of a cell */
   711   int usableSize;            /* Number of usable bytes on a page */
   712   int cellOffset;            /* Offset to the cell pointer array */
   713   int cbrk;                  /* Offset to the cell content area */
   714   int nCell;                 /* Number of cells on the page */
   715   unsigned char *data;       /* The page data */
   716   unsigned char *temp;       /* Temp area for cell content */
   717 
   718   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   719   assert( pPage->pBt!=0 );
   720   assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
   721   assert( pPage->nOverflow==0 );
   722   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   723   temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
   724   data = pPage->aData;
   725   hdr = pPage->hdrOffset;
   726   cellOffset = pPage->cellOffset;
   727   nCell = pPage->nCell;
   728   assert( nCell==get2byte(&data[hdr+3]) );
   729   usableSize = pPage->pBt->usableSize;
   730   cbrk = get2byte(&data[hdr+5]);
   731   memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
   732   cbrk = usableSize;
   733   for(i=0; i<nCell; i++){
   734     u8 *pAddr;     /* The i-th cell pointer */
   735     pAddr = &data[cellOffset + i*2];
   736     pc = get2byte(pAddr);
   737     if( pc>=usableSize ){
   738       return SQLITE_CORRUPT_BKPT;
   739     }
   740     size = cellSizePtr(pPage, &temp[pc]);
   741     cbrk -= size;
   742     if( cbrk<cellOffset+2*nCell || pc+size>usableSize ){
   743       return SQLITE_CORRUPT_BKPT;
   744     }
   745     assert( cbrk+size<=usableSize && cbrk>=0 );
   746     memcpy(&data[cbrk], &temp[pc], size);
   747     put2byte(pAddr, cbrk);
   748   }
   749   assert( cbrk>=cellOffset+2*nCell );
   750   put2byte(&data[hdr+5], cbrk);
   751   data[hdr+1] = 0;
   752   data[hdr+2] = 0;
   753   data[hdr+7] = 0;
   754   addr = cellOffset+2*nCell;
   755   memset(&data[addr], 0, cbrk-addr);
   756   if( cbrk-addr!=pPage->nFree ){
   757     return SQLITE_CORRUPT_BKPT;
   758   }
   759   return SQLITE_OK;
   760 }
   761 
   762 /*
   763 ** Allocate nByte bytes of space on a page.
   764 **
   765 ** Return the index into pPage->aData[] of the first byte of
   766 ** the new allocation.  The caller guarantees that there is enough
   767 ** space.  This routine will never fail.
   768 **
   769 ** If the page contains nBytes of free space but does not contain
   770 ** nBytes of contiguous free space, then this routine automatically
   771 ** calls defragementPage() to consolidate all free space before 
   772 ** allocating the new chunk.
   773 */
   774 static int allocateSpace(MemPage *pPage, int nByte){
   775   int addr, pc, hdr;
   776   int size;
   777   int nFrag;
   778   int top;
   779   int nCell;
   780   int cellOffset;
   781   unsigned char *data;
   782   
   783   data = pPage->aData;
   784   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   785   assert( pPage->pBt );
   786   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   787   assert( nByte>=0 );  /* Minimum cell size is 4 */
   788   assert( pPage->nFree>=nByte );
   789   assert( pPage->nOverflow==0 );
   790   pPage->nFree -= nByte;
   791   hdr = pPage->hdrOffset;
   792 
   793   nFrag = data[hdr+7];
   794   if( nFrag<60 ){
   795     /* Search the freelist looking for a slot big enough to satisfy the
   796     ** space request. */
   797     addr = hdr+1;
   798     while( (pc = get2byte(&data[addr]))>0 ){
   799       size = get2byte(&data[pc+2]);
   800       if( size>=nByte ){
   801         if( size<nByte+4 ){
   802           memcpy(&data[addr], &data[pc], 2);
   803           data[hdr+7] = nFrag + size - nByte;
   804           return pc;
   805         }else{
   806           put2byte(&data[pc+2], size-nByte);
   807           return pc + size - nByte;
   808         }
   809       }
   810       addr = pc;
   811     }
   812   }
   813 
   814   /* Allocate memory from the gap in between the cell pointer array
   815   ** and the cell content area.
   816   */
   817   top = get2byte(&data[hdr+5]);
   818   nCell = get2byte(&data[hdr+3]);
   819   cellOffset = pPage->cellOffset;
   820   if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
   821     defragmentPage(pPage);
   822     top = get2byte(&data[hdr+5]);
   823   }
   824   top -= nByte;
   825   assert( cellOffset + 2*nCell <= top );
   826   put2byte(&data[hdr+5], top);
   827   return top;
   828 }
   829 
   830 /*
   831 ** Return a section of the pPage->aData to the freelist.
   832 ** The first byte of the new free block is pPage->aDisk[start]
   833 ** and the size of the block is "size" bytes.
   834 **
   835 ** Most of the effort here is involved in coalesing adjacent
   836 ** free blocks into a single big free block.
   837 */
   838 static int freeSpace(MemPage *pPage, int start, int size){
   839   int addr, pbegin, hdr;
   840   unsigned char *data = pPage->aData;
   841 
   842   assert( pPage->pBt!=0 );
   843   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
   844   assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
   845   assert( (start + size)<=pPage->pBt->usableSize );
   846   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   847   assert( size>=0 );   /* Minimum cell size is 4 */
   848 
   849 #ifdef SQLITE_SECURE_DELETE
   850   /* Overwrite deleted information with zeros when the SECURE_DELETE 
   851   ** option is enabled at compile-time */
   852   memset(&data[start], 0, size);
   853 #endif
   854 
   855   /* Add the space back into the linked list of freeblocks */
   856   hdr = pPage->hdrOffset;
   857   addr = hdr + 1;
   858   while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
   859     assert( pbegin<=pPage->pBt->usableSize-4 );
   860     if( pbegin<=addr ) return SQLITE_CORRUPT_BKPT;
   861     addr = pbegin;
   862   }
   863   if( pbegin>pPage->pBt->usableSize-4 ) return SQLITE_CORRUPT_BKPT;
   864   assert( pbegin>addr || pbegin==0 );
   865   put2byte(&data[addr], start);
   866   put2byte(&data[start], pbegin);
   867   put2byte(&data[start+2], size);
   868   pPage->nFree += size;
   869 
   870   /* Coalesce adjacent free blocks */
   871   addr = pPage->hdrOffset + 1;
   872   while( (pbegin = get2byte(&data[addr]))>0 ){
   873     int pnext, psize;
   874     assert( pbegin>addr );
   875     assert( pbegin<=pPage->pBt->usableSize-4 );
   876     pnext = get2byte(&data[pbegin]);
   877     psize = get2byte(&data[pbegin+2]);
   878     if( pbegin + psize + 3 >= pnext && pnext>0 ){
   879       int frag = pnext - (pbegin+psize);
   880       if( frag<0 || frag>data[pPage->hdrOffset+7] ) return SQLITE_CORRUPT_BKPT;
   881       data[pPage->hdrOffset+7] -= frag;
   882       put2byte(&data[pbegin], get2byte(&data[pnext]));
   883       put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
   884     }else{
   885       addr = pbegin;
   886     }
   887   }
   888 
   889   /* If the cell content area begins with a freeblock, remove it. */
   890   if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
   891     int top;
   892     pbegin = get2byte(&data[hdr+1]);
   893     memcpy(&data[hdr+1], &data[pbegin], 2);
   894     top = get2byte(&data[hdr+5]);
   895     put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
   896   }
   897   return SQLITE_OK;
   898 }
   899 
   900 /*
   901 ** Decode the flags byte (the first byte of the header) for a page
   902 ** and initialize fields of the MemPage structure accordingly.
   903 **
   904 ** Only the following combinations are supported.  Anything different
   905 ** indicates a corrupt database files:
   906 **
   907 **         PTF_ZERODATA
   908 **         PTF_ZERODATA | PTF_LEAF
   909 **         PTF_LEAFDATA | PTF_INTKEY
   910 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
   911 */
   912 static int decodeFlags(MemPage *pPage, int flagByte){
   913   BtShared *pBt;     /* A copy of pPage->pBt */
   914 
   915   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
   916   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   917   pPage->leaf = flagByte>>3;  assert( PTF_LEAF == 1<<3 );
   918   flagByte &= ~PTF_LEAF;
   919   pPage->childPtrSize = 4-4*pPage->leaf;
   920   pBt = pPage->pBt;
   921   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
   922     pPage->intKey = 1;
   923     pPage->hasData = pPage->leaf;
   924     pPage->maxLocal = pBt->maxLeaf;
   925     pPage->minLocal = pBt->minLeaf;
   926   }else if( flagByte==PTF_ZERODATA ){
   927     pPage->intKey = 0;
   928     pPage->hasData = 0;
   929     pPage->maxLocal = pBt->maxLocal;
   930     pPage->minLocal = pBt->minLocal;
   931   }else{
   932     return SQLITE_CORRUPT_BKPT;
   933   }
   934   return SQLITE_OK;
   935 }
   936 
   937 /*
   938 ** Initialize the auxiliary information for a disk block.
   939 **
   940 ** Return SQLITE_OK on success.  If we see that the page does
   941 ** not contain a well-formed database page, then return 
   942 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
   943 ** guarantee that the page is well-formed.  It only shows that
   944 ** we failed to detect any corruption.
   945 */
   946 int sqlite3BtreeInitPage(MemPage *pPage){
   947 
   948   assert( pPage->pBt!=0 );
   949   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   950   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
   951   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
   952   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
   953 
   954   if( !pPage->isInit ){
   955     int pc;            /* Address of a freeblock within pPage->aData[] */
   956     int hdr;           /* Offset to beginning of page header */
   957     u8 *data;          /* Equal to pPage->aData */
   958     BtShared *pBt;        /* The main btree structure */
   959     int usableSize;    /* Amount of usable space on each page */
   960     int cellOffset;    /* Offset from start of page to first cell pointer */
   961     int nFree;         /* Number of unused bytes on the page */
   962     int top;           /* First byte of the cell content area */
   963 
   964     pBt = pPage->pBt;
   965 
   966     hdr = pPage->hdrOffset;
   967     data = pPage->aData;
   968     if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
   969     assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
   970     pPage->maskPage = pBt->pageSize - 1;
   971     pPage->nOverflow = 0;
   972     usableSize = pBt->usableSize;
   973     pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
   974     top = get2byte(&data[hdr+5]);
   975     pPage->nCell = get2byte(&data[hdr+3]);
   976     if( pPage->nCell>MX_CELL(pBt) ){
   977       /* To many cells for a single page.  The page must be corrupt */
   978       return SQLITE_CORRUPT_BKPT;
   979     }
   980   
   981     /* Compute the total free space on the page */
   982     pc = get2byte(&data[hdr+1]);
   983     nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
   984     while( pc>0 ){
   985       int next, size;
   986       if( pc>usableSize-4 ){
   987         /* Free block is off the page */
   988         return SQLITE_CORRUPT_BKPT; 
   989       }
   990       next = get2byte(&data[pc]);
   991       size = get2byte(&data[pc+2]);
   992       if( next>0 && next<=pc+size+3 ){
   993         /* Free blocks must be in accending order */
   994         return SQLITE_CORRUPT_BKPT; 
   995       }
   996       nFree += size;
   997       pc = next;
   998     }
   999     pPage->nFree = nFree;
  1000     if( nFree>=usableSize ){
  1001       /* Free space cannot exceed total page size */
  1002       return SQLITE_CORRUPT_BKPT; 
  1003     }
  1004 
  1005 #if 0
  1006   /* Check that all the offsets in the cell offset array are within range. 
  1007   ** 
  1008   ** Omitting this consistency check and using the pPage->maskPage mask
  1009   ** to prevent overrunning the page buffer in findCell() results in a
  1010   ** 2.5% performance gain.
  1011   */
  1012   {
  1013     u8 *pOff;        /* Iterator used to check all cell offsets are in range */
  1014     u8 *pEnd;        /* Pointer to end of cell offset array */
  1015     u8 mask;         /* Mask of bits that must be zero in MSB of cell offsets */
  1016     mask = ~(((u8)(pBt->pageSize>>8))-1);
  1017     pEnd = &data[cellOffset + pPage->nCell*2];
  1018     for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
  1019     if( pOff!=pEnd ){
  1020       return SQLITE_CORRUPT_BKPT;
  1021     }
  1022   }
  1023 #endif
  1024 
  1025     pPage->isInit = 1;
  1026   }
  1027   return SQLITE_OK;
  1028 }
  1029 
  1030 /*
  1031 ** Set up a raw page so that it looks like a database page holding
  1032 ** no entries.
  1033 */
  1034 static void zeroPage(MemPage *pPage, int flags){
  1035   unsigned char *data = pPage->aData;
  1036   BtShared *pBt = pPage->pBt;
  1037   int hdr = pPage->hdrOffset;
  1038   int first;
  1039 
  1040   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
  1041   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1042   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
  1043   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1044   assert( sqlite3_mutex_held(pBt->mutex) );
  1045   /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
  1046   data[hdr] = flags;
  1047   first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
  1048   memset(&data[hdr+1], 0, 4);
  1049   data[hdr+7] = 0;
  1050   put2byte(&data[hdr+5], pBt->usableSize);
  1051   pPage->nFree = pBt->usableSize - first;
  1052   decodeFlags(pPage, flags);
  1053   pPage->hdrOffset = hdr;
  1054   pPage->cellOffset = first;
  1055   pPage->nOverflow = 0;
  1056   assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
  1057   pPage->maskPage = pBt->pageSize - 1;
  1058   pPage->nCell = 0;
  1059   pPage->isInit = 1;
  1060 }
  1061 
  1062 
  1063 /*
  1064 ** Convert a DbPage obtained from the pager into a MemPage used by
  1065 ** the btree layer.
  1066 */
  1067 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
  1068   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  1069   pPage->aData = sqlite3PagerGetData(pDbPage);
  1070   pPage->pDbPage = pDbPage;
  1071   pPage->pBt = pBt;
  1072   pPage->pgno = pgno;
  1073   pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  1074   return pPage; 
  1075 }
  1076 
  1077 /*
  1078 ** Get a page from the pager.  Initialize the MemPage.pBt and
  1079 ** MemPage.aData elements if needed.
  1080 **
  1081 ** If the noContent flag is set, it means that we do not care about
  1082 ** the content of the page at this time.  So do not go to the disk
  1083 ** to fetch the content.  Just fill in the content with zeros for now.
  1084 ** If in the future we call sqlite3PagerWrite() on this page, that
  1085 ** means we have started to be concerned about content and the disk
  1086 ** read should occur at that point.
  1087 */
  1088 int sqlite3BtreeGetPage(
  1089   BtShared *pBt,       /* The btree */
  1090   Pgno pgno,           /* Number of the page to fetch */
  1091   MemPage **ppPage,    /* Return the page in this parameter */
  1092   int noContent        /* Do not load page content if true */
  1093 ){
  1094   int rc;
  1095   DbPage *pDbPage;
  1096 
  1097   assert( sqlite3_mutex_held(pBt->mutex) );
  1098   rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
  1099   if( rc ) return rc;
  1100   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1101   return SQLITE_OK;
  1102 }
  1103 
  1104 /*
  1105 ** Return the size of the database file in pages.  Or return -1 if
  1106 ** there is any kind of error.
  1107 */
  1108 static int pagerPagecount(Pager *pPager){
  1109   int rc;
  1110   int nPage;
  1111   rc = sqlite3PagerPagecount(pPager, &nPage);
  1112   return (rc==SQLITE_OK?nPage:-1);
  1113 }
  1114 
  1115 /*
  1116 ** Get a page from the pager and initialize it.  This routine
  1117 ** is just a convenience wrapper around separate calls to
  1118 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
  1119 */
  1120 static int getAndInitPage(
  1121   BtShared *pBt,          /* The database file */
  1122   Pgno pgno,           /* Number of the page to get */
  1123   MemPage **ppPage     /* Write the page pointer here */
  1124 ){
  1125   int rc;
  1126   DbPage *pDbPage;
  1127   MemPage *pPage;
  1128 
  1129   assert( sqlite3_mutex_held(pBt->mutex) );
  1130   if( pgno==0 ){
  1131     return SQLITE_CORRUPT_BKPT; 
  1132   }
  1133 
  1134   /* It is often the case that the page we want is already in cache.
  1135   ** If so, get it directly.  This saves us from having to call
  1136   ** pagerPagecount() to make sure pgno is within limits, which results
  1137   ** in a measureable performance improvements.
  1138   */
  1139   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
  1140   if( pDbPage ){
  1141     /* Page is already in cache */
  1142     *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1143     rc = SQLITE_OK;
  1144   }else{
  1145     /* Page not in cache.  Acquire it. */
  1146     if( pgno>pagerPagecount(pBt->pPager) ){
  1147       return SQLITE_CORRUPT_BKPT; 
  1148     }
  1149     rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
  1150     if( rc ) return rc;
  1151     pPage = *ppPage;
  1152   }
  1153   if( !pPage->isInit ){
  1154     rc = sqlite3BtreeInitPage(pPage);
  1155   }
  1156   if( rc!=SQLITE_OK ){
  1157     releasePage(pPage);
  1158     *ppPage = 0;
  1159   }
  1160   return rc;
  1161 }
  1162 
  1163 /*
  1164 ** Release a MemPage.  This should be called once for each prior
  1165 ** call to sqlite3BtreeGetPage.
  1166 */
  1167 static void releasePage(MemPage *pPage){
  1168   if( pPage ){
  1169     assert( pPage->aData );
  1170     assert( pPage->pBt );
  1171     assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1172     assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  1173     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1174     sqlite3PagerUnref(pPage->pDbPage);
  1175   }
  1176 }
  1177 
  1178 /*
  1179 ** During a rollback, when the pager reloads information into the cache
  1180 ** so that the cache is restored to its original state at the start of
  1181 ** the transaction, for each page restored this routine is called.
  1182 **
  1183 ** This routine needs to reset the extra data section at the end of the
  1184 ** page to agree with the restored data.
  1185 */
  1186 static void pageReinit(DbPage *pData){
  1187   MemPage *pPage;
  1188   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
  1189   if( pPage->isInit ){
  1190     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1191     pPage->isInit = 0;
  1192     if( sqlite3PagerPageRefcount(pData)>0 ){
  1193       sqlite3BtreeInitPage(pPage);
  1194     }
  1195   }
  1196 }
  1197 
  1198 /*
  1199 ** Invoke the busy handler for a btree.
  1200 */
  1201 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
  1202   BtShared *pBt = (BtShared*)pArg;
  1203   assert( pBt->db );
  1204   assert( sqlite3_mutex_held(pBt->db->mutex) );
  1205   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
  1206 }
  1207 
  1208 /*
  1209 ** Open a database file.
  1210 ** 
  1211 ** zFilename is the name of the database file.  If zFilename is NULL
  1212 ** a new database with a random name is created.  This randomly named
  1213 ** database file will be deleted when sqlite3BtreeClose() is called.
  1214 ** If zFilename is ":memory:" then an in-memory database is created
  1215 ** that is automatically destroyed when it is closed.
  1216 */
  1217 int sqlite3BtreeOpen(
  1218   const char *zFilename,  /* Name of the file containing the BTree database */
  1219   sqlite3 *db,            /* Associated database handle */
  1220   Btree **ppBtree,        /* Pointer to new Btree object written here */
  1221   int flags,              /* Options */
  1222   int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
  1223 ){
  1224   sqlite3_vfs *pVfs;      /* The VFS to use for this btree */
  1225   BtShared *pBt = 0;      /* Shared part of btree structure */
  1226   Btree *p;               /* Handle to return */
  1227   int rc = SQLITE_OK;
  1228   int nReserve;
  1229   unsigned char zDbHeader[100];
  1230 
  1231   /* Set the variable isMemdb to true for an in-memory database, or 
  1232   ** false for a file-based database. This symbol is only required if
  1233   ** either of the shared-data or autovacuum features are compiled 
  1234   ** into the library.
  1235   */
  1236 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
  1237   #ifdef SQLITE_OMIT_MEMORYDB
  1238     const int isMemdb = 0;
  1239   #else
  1240     const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
  1241   #endif
  1242 #endif
  1243 
  1244   assert( db!=0 );
  1245   assert( sqlite3_mutex_held(db->mutex) );
  1246 
  1247   pVfs = db->pVfs;
  1248   p = sqlite3MallocZero(sizeof(Btree));
  1249   if( !p ){
  1250     return SQLITE_NOMEM;
  1251   }
  1252   p->inTrans = TRANS_NONE;
  1253   p->db = db;
  1254 
  1255 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1256   /*
  1257   ** If this Btree is a candidate for shared cache, try to find an
  1258   ** existing BtShared object that we can share with
  1259   */
  1260   if( isMemdb==0
  1261    && (db->flags & SQLITE_Vtab)==0
  1262    && zFilename && zFilename[0]
  1263   ){
  1264     if( sqlite3GlobalConfig.sharedCacheEnabled ){
  1265       int nFullPathname = pVfs->mxPathname+1;
  1266       char *zFullPathname = sqlite3Malloc(nFullPathname);
  1267       sqlite3_mutex *mutexShared;
  1268       p->sharable = 1;
  1269       db->flags |= SQLITE_SharedCache;
  1270       if( !zFullPathname ){
  1271         sqlite3_free(p);
  1272         return SQLITE_NOMEM;
  1273       }
  1274       sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
  1275       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1276       sqlite3_mutex_enter(mutexShared);
  1277       for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
  1278         assert( pBt->nRef>0 );
  1279         if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
  1280                  && sqlite3PagerVfs(pBt->pPager)==pVfs ){
  1281           p->pBt = pBt;
  1282           pBt->nRef++;
  1283           break;
  1284         }
  1285       }
  1286       sqlite3_mutex_leave(mutexShared);
  1287       sqlite3_free(zFullPathname);
  1288     }
  1289 #ifdef SQLITE_DEBUG
  1290     else{
  1291       /* In debug mode, we mark all persistent databases as sharable
  1292       ** even when they are not.  This exercises the locking code and
  1293       ** gives more opportunity for asserts(sqlite3_mutex_held())
  1294       ** statements to find locking problems.
  1295       */
  1296       p->sharable = 1;
  1297     }
  1298 #endif
  1299   }
  1300 #endif
  1301   if( pBt==0 ){
  1302     /*
  1303     ** The following asserts make sure that structures used by the btree are
  1304     ** the right size.  This is to guard against size changes that result
  1305     ** when compiling on a different architecture.
  1306     */
  1307     assert( sizeof(i64)==8 || sizeof(i64)==4 );
  1308     assert( sizeof(u64)==8 || sizeof(u64)==4 );
  1309     assert( sizeof(u32)==4 );
  1310     assert( sizeof(u16)==2 );
  1311     assert( sizeof(Pgno)==4 );
  1312   
  1313     pBt = sqlite3MallocZero( sizeof(*pBt) );
  1314     if( pBt==0 ){
  1315       rc = SQLITE_NOMEM;
  1316       goto btree_open_out;
  1317     }
  1318     pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
  1319     pBt->busyHdr.pArg = pBt;
  1320     rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
  1321                           EXTRA_SIZE, flags, vfsFlags);
  1322     if( rc==SQLITE_OK ){
  1323       rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
  1324     }
  1325     if( rc!=SQLITE_OK ){
  1326       goto btree_open_out;
  1327     }
  1328     sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
  1329     p->pBt = pBt;
  1330   
  1331     sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
  1332     pBt->pCursor = 0;
  1333     pBt->pPage1 = 0;
  1334     pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
  1335     pBt->pageSize = get2byte(&zDbHeader[16]);
  1336     if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
  1337          || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
  1338       pBt->pageSize = 0;
  1339       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1340 #ifndef SQLITE_OMIT_AUTOVACUUM
  1341       /* If the magic name ":memory:" will create an in-memory database, then
  1342       ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
  1343       ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
  1344       ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
  1345       ** regular file-name. In this case the auto-vacuum applies as per normal.
  1346       */
  1347       if( zFilename && !isMemdb ){
  1348         pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
  1349         pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
  1350       }
  1351 #endif
  1352       nReserve = 0;
  1353     }else{
  1354       nReserve = zDbHeader[20];
  1355       pBt->pageSizeFixed = 1;
  1356 #ifndef SQLITE_OMIT_AUTOVACUUM
  1357       pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
  1358       pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
  1359 #endif
  1360     }
  1361     pBt->usableSize = pBt->pageSize - nReserve;
  1362     assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
  1363     sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1364    
  1365 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1366     /* Add the new BtShared object to the linked list sharable BtShareds.
  1367     */
  1368     if( p->sharable ){
  1369       sqlite3_mutex *mutexShared;
  1370       pBt->nRef = 1;
  1371       mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1372       if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
  1373         pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
  1374         if( pBt->mutex==0 ){
  1375           rc = SQLITE_NOMEM;
  1376           db->mallocFailed = 0;
  1377           goto btree_open_out;
  1378         }
  1379       }
  1380       sqlite3_mutex_enter(mutexShared);
  1381       pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1382       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
  1383       sqlite3_mutex_leave(mutexShared);
  1384     }
  1385 #endif
  1386   }
  1387 
  1388 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1389   /* If the new Btree uses a sharable pBtShared, then link the new
  1390   ** Btree into the list of all sharable Btrees for the same connection.
  1391   ** The list is kept in ascending order by pBt address.
  1392   */
  1393   if( p->sharable ){
  1394     int i;
  1395     Btree *pSib;
  1396     for(i=0; i<db->nDb; i++){
  1397       if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
  1398         while( pSib->pPrev ){ pSib = pSib->pPrev; }
  1399         if( p->pBt<pSib->pBt ){
  1400           p->pNext = pSib;
  1401           p->pPrev = 0;
  1402           pSib->pPrev = p;
  1403         }else{
  1404           while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
  1405             pSib = pSib->pNext;
  1406           }
  1407           p->pNext = pSib->pNext;
  1408           p->pPrev = pSib;
  1409           if( p->pNext ){
  1410             p->pNext->pPrev = p;
  1411           }
  1412           pSib->pNext = p;
  1413         }
  1414         break;
  1415       }
  1416     }
  1417   }
  1418 #endif
  1419   *ppBtree = p;
  1420 
  1421 btree_open_out:
  1422   if( rc!=SQLITE_OK ){
  1423     if( pBt && pBt->pPager ){
  1424       sqlite3PagerClose(pBt->pPager);
  1425     }
  1426     sqlite3_free(pBt);
  1427     sqlite3_free(p);
  1428     *ppBtree = 0;
  1429   }
  1430   return rc;
  1431 }
  1432 
  1433 /*
  1434 ** Decrement the BtShared.nRef counter.  When it reaches zero,
  1435 ** remove the BtShared structure from the sharing list.  Return
  1436 ** true if the BtShared.nRef counter reaches zero and return
  1437 ** false if it is still positive.
  1438 */
  1439 static int removeFromSharingList(BtShared *pBt){
  1440 #ifndef SQLITE_OMIT_SHARED_CACHE
  1441   sqlite3_mutex *pMaster;
  1442   BtShared *pList;
  1443   int removed = 0;
  1444 
  1445   assert( sqlite3_mutex_notheld(pBt->mutex) );
  1446   pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1447   sqlite3_mutex_enter(pMaster);
  1448   pBt->nRef--;
  1449   if( pBt->nRef<=0 ){
  1450     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
  1451       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
  1452     }else{
  1453       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
  1454       while( ALWAYS(pList) && pList->pNext!=pBt ){
  1455         pList=pList->pNext;
  1456       }
  1457       if( ALWAYS(pList) ){
  1458         pList->pNext = pBt->pNext;
  1459       }
  1460     }
  1461     if( SQLITE_THREADSAFE ){
  1462       sqlite3_mutex_free(pBt->mutex);
  1463     }
  1464     removed = 1;
  1465   }
  1466   sqlite3_mutex_leave(pMaster);
  1467   return removed;
  1468 #else
  1469   return 1;
  1470 #endif
  1471 }
  1472 
  1473 /*
  1474 ** Make sure pBt->pTmpSpace points to an allocation of 
  1475 ** MX_CELL_SIZE(pBt) bytes.
  1476 */
  1477 static void allocateTempSpace(BtShared *pBt){
  1478   if( !pBt->pTmpSpace ){
  1479     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
  1480   }
  1481 }
  1482 
  1483 /*
  1484 ** Free the pBt->pTmpSpace allocation
  1485 */
  1486 static void freeTempSpace(BtShared *pBt){
  1487   sqlite3PageFree( pBt->pTmpSpace);
  1488   pBt->pTmpSpace = 0;
  1489 }
  1490 
  1491 /*
  1492 ** Close an open database and invalidate all cursors.
  1493 */
  1494 int sqlite3BtreeClose(Btree *p){
  1495   BtShared *pBt = p->pBt;
  1496   BtCursor *pCur;
  1497 
  1498   /* Close all cursors opened via this handle.  */
  1499   assert( sqlite3_mutex_held(p->db->mutex) );
  1500   sqlite3BtreeEnter(p);
  1501   pBt->db = p->db;
  1502   pCur = pBt->pCursor;
  1503   while( pCur ){
  1504     BtCursor *pTmp = pCur;
  1505     pCur = pCur->pNext;
  1506     if( pTmp->pBtree==p ){
  1507       sqlite3BtreeCloseCursor(pTmp);
  1508     }
  1509   }
  1510 
  1511   /* Rollback any active transaction and free the handle structure.
  1512   ** The call to sqlite3BtreeRollback() drops any table-locks held by
  1513   ** this handle.
  1514   */
  1515   sqlite3BtreeRollback(p);
  1516   sqlite3BtreeLeave(p);
  1517 
  1518   /* If there are still other outstanding references to the shared-btree
  1519   ** structure, return now. The remainder of this procedure cleans 
  1520   ** up the shared-btree.
  1521   */
  1522   assert( p->wantToLock==0 && p->locked==0 );
  1523   if( !p->sharable || removeFromSharingList(pBt) ){
  1524     /* The pBt is no longer on the sharing list, so we can access
  1525     ** it without having to hold the mutex.
  1526     **
  1527     ** Clean out and delete the BtShared object.
  1528     */
  1529     assert( !pBt->pCursor );
  1530     sqlite3PagerClose(pBt->pPager);
  1531     if( pBt->xFreeSchema && pBt->pSchema ){
  1532       pBt->xFreeSchema(pBt->pSchema);
  1533     }
  1534     sqlite3_free(pBt->pSchema);
  1535     freeTempSpace(pBt);
  1536     sqlite3_free(pBt);
  1537   }
  1538 
  1539 #ifndef SQLITE_OMIT_SHARED_CACHE
  1540   assert( p->wantToLock==0 );
  1541   assert( p->locked==0 );
  1542   if( p->pPrev ) p->pPrev->pNext = p->pNext;
  1543   if( p->pNext ) p->pNext->pPrev = p->pPrev;
  1544 #endif
  1545 
  1546   sqlite3_free(p);
  1547   return SQLITE_OK;
  1548 }
  1549 
  1550 /*
  1551 ** Change the limit on the number of pages allowed in the cache.
  1552 **
  1553 ** The maximum number of cache pages is set to the absolute
  1554 ** value of mxPage.  If mxPage is negative, the pager will
  1555 ** operate asynchronously - it will not stop to do fsync()s
  1556 ** to insure data is written to the disk surface before
  1557 ** continuing.  Transactions still work if synchronous is off,
  1558 ** and the database cannot be corrupted if this program
  1559 ** crashes.  But if the operating system crashes or there is
  1560 ** an abrupt power failure when synchronous is off, the database
  1561 ** could be left in an inconsistent and unrecoverable state.
  1562 ** Synchronous is on by default so database corruption is not
  1563 ** normally a worry.
  1564 */
  1565 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
  1566   BtShared *pBt = p->pBt;
  1567   assert( sqlite3_mutex_held(p->db->mutex) );
  1568   sqlite3BtreeEnter(p);
  1569   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
  1570   sqlite3BtreeLeave(p);
  1571   return SQLITE_OK;
  1572 }
  1573 
  1574 /*
  1575 ** Change the way data is synced to disk in order to increase or decrease
  1576 ** how well the database resists damage due to OS crashes and power
  1577 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
  1578 ** there is a high probability of damage)  Level 2 is the default.  There
  1579 ** is a very low but non-zero probability of damage.  Level 3 reduces the
  1580 ** probability of damage to near zero but with a write performance reduction.
  1581 */
  1582 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  1583 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
  1584   BtShared *pBt = p->pBt;
  1585   assert( sqlite3_mutex_held(p->db->mutex) );
  1586   sqlite3BtreeEnter(p);
  1587   sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
  1588   sqlite3BtreeLeave(p);
  1589   return SQLITE_OK;
  1590 }
  1591 #endif
  1592 
  1593 /*
  1594 ** Return TRUE if the given btree is set to safety level 1.  In other
  1595 ** words, return TRUE if no sync() occurs on the disk files.
  1596 */
  1597 int sqlite3BtreeSyncDisabled(Btree *p){
  1598   BtShared *pBt = p->pBt;
  1599   int rc;
  1600   assert( sqlite3_mutex_held(p->db->mutex) );  
  1601   sqlite3BtreeEnter(p);
  1602   assert( pBt && pBt->pPager );
  1603   rc = sqlite3PagerNosync(pBt->pPager);
  1604   sqlite3BtreeLeave(p);
  1605   return rc;
  1606 }
  1607 
  1608 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
  1609 /*
  1610 ** Change the default pages size and the number of reserved bytes per page.
  1611 **
  1612 ** The page size must be a power of 2 between 512 and 65536.  If the page
  1613 ** size supplied does not meet this constraint then the page size is not
  1614 ** changed.
  1615 **
  1616 ** Page sizes are constrained to be a power of two so that the region
  1617 ** of the database file used for locking (beginning at PENDING_BYTE,
  1618 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
  1619 ** at the beginning of a page.
  1620 **
  1621 ** If parameter nReserve is less than zero, then the number of reserved
  1622 ** bytes per page is left unchanged.
  1623 */
  1624 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
  1625   int rc = SQLITE_OK;
  1626   BtShared *pBt = p->pBt;
  1627   sqlite3BtreeEnter(p);
  1628   if( pBt->pageSizeFixed ){
  1629     sqlite3BtreeLeave(p);
  1630     return SQLITE_READONLY;
  1631   }
  1632   if( nReserve<0 ){
  1633     nReserve = pBt->pageSize - pBt->usableSize;
  1634   }
  1635   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
  1636         ((pageSize-1)&pageSize)==0 ){
  1637     assert( (pageSize & 7)==0 );
  1638     assert( !pBt->pPage1 && !pBt->pCursor );
  1639     pBt->pageSize = pageSize;
  1640     freeTempSpace(pBt);
  1641     rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1642   }
  1643   pBt->usableSize = pBt->pageSize - nReserve;
  1644   sqlite3BtreeLeave(p);
  1645   return rc;
  1646 }
  1647 
  1648 /*
  1649 ** Return the currently defined page size
  1650 */
  1651 int sqlite3BtreeGetPageSize(Btree *p){
  1652   return p->pBt->pageSize;
  1653 }
  1654 int sqlite3BtreeGetReserve(Btree *p){
  1655   int n;
  1656   sqlite3BtreeEnter(p);
  1657   n = p->pBt->pageSize - p->pBt->usableSize;
  1658   sqlite3BtreeLeave(p);
  1659   return n;
  1660 }
  1661 
  1662 /*
  1663 ** Set the maximum page count for a database if mxPage is positive.
  1664 ** No changes are made if mxPage is 0 or negative.
  1665 ** Regardless of the value of mxPage, return the maximum page count.
  1666 */
  1667 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
  1668   int n;
  1669   sqlite3BtreeEnter(p);
  1670   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
  1671   sqlite3BtreeLeave(p);
  1672   return n;
  1673 }
  1674 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
  1675 
  1676 /*
  1677 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
  1678 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
  1679 ** is disabled. The default value for the auto-vacuum property is 
  1680 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
  1681 */
  1682 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
  1683 #ifdef SQLITE_OMIT_AUTOVACUUM
  1684   return SQLITE_READONLY;
  1685 #else
  1686   BtShared *pBt = p->pBt;
  1687   int rc = SQLITE_OK;
  1688   int av = (autoVacuum?1:0);
  1689 
  1690   sqlite3BtreeEnter(p);
  1691   if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
  1692     rc = SQLITE_READONLY;
  1693   }else{
  1694     pBt->autoVacuum = av;
  1695   }
  1696   sqlite3BtreeLeave(p);
  1697   return rc;
  1698 #endif
  1699 }
  1700 
  1701 /*
  1702 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
  1703 ** enabled 1 is returned. Otherwise 0.
  1704 */
  1705 int sqlite3BtreeGetAutoVacuum(Btree *p){
  1706 #ifdef SQLITE_OMIT_AUTOVACUUM
  1707   return BTREE_AUTOVACUUM_NONE;
  1708 #else
  1709   int rc;
  1710   sqlite3BtreeEnter(p);
  1711   rc = (
  1712     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
  1713     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
  1714     BTREE_AUTOVACUUM_INCR
  1715   );
  1716   sqlite3BtreeLeave(p);
  1717   return rc;
  1718 #endif
  1719 }
  1720 
  1721 
  1722 /*
  1723 ** Get a reference to pPage1 of the database file.  This will
  1724 ** also acquire a readlock on that file.
  1725 **
  1726 ** SQLITE_OK is returned on success.  If the file is not a
  1727 ** well-formed database file, then SQLITE_CORRUPT is returned.
  1728 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
  1729 ** is returned if we run out of memory. 
  1730 */
  1731 static int lockBtree(BtShared *pBt){
  1732   int rc;
  1733   MemPage *pPage1;
  1734   int nPage;
  1735 
  1736   assert( sqlite3_mutex_held(pBt->mutex) );
  1737   if( pBt->pPage1 ) return SQLITE_OK;
  1738   rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
  1739   if( rc!=SQLITE_OK ) return rc;
  1740 
  1741   /* Do some checking to help insure the file we opened really is
  1742   ** a valid database file. 
  1743   */
  1744   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
  1745   if( rc!=SQLITE_OK ){
  1746     goto page1_init_failed;
  1747   }else if( nPage>0 ){
  1748     int pageSize;
  1749     int usableSize;
  1750     u8 *page1 = pPage1->aData;
  1751     rc = SQLITE_NOTADB;
  1752     if( memcmp(page1, zMagicHeader, 16)!=0 ){
  1753       goto page1_init_failed;
  1754     }
  1755     if( page1[18]>1 ){
  1756       pBt->readOnly = 1;
  1757     }
  1758     if( page1[19]>1 ){
  1759       goto page1_init_failed;
  1760     }
  1761 
  1762     /* The maximum embedded fraction must be exactly 25%.  And the minimum
  1763     ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
  1764     ** The original design allowed these amounts to vary, but as of
  1765     ** version 3.6.0, we require them to be fixed.
  1766     */
  1767     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
  1768       goto page1_init_failed;
  1769     }
  1770     pageSize = get2byte(&page1[16]);
  1771     if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
  1772         (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
  1773     ){
  1774       goto page1_init_failed;
  1775     }
  1776     assert( (pageSize & 7)==0 );
  1777     usableSize = pageSize - page1[20];
  1778     if( pageSize!=pBt->pageSize ){
  1779       /* After reading the first page of the database assuming a page size
  1780       ** of BtShared.pageSize, we have discovered that the page-size is
  1781       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
  1782       ** zero and return SQLITE_OK. The caller will call this function
  1783       ** again with the correct page-size.
  1784       */
  1785       releasePage(pPage1);
  1786       pBt->usableSize = usableSize;
  1787       pBt->pageSize = pageSize;
  1788       freeTempSpace(pBt);
  1789       sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
  1790       return SQLITE_OK;
  1791     }
  1792     if( usableSize<500 ){
  1793       goto page1_init_failed;
  1794     }
  1795     pBt->pageSize = pageSize;
  1796     pBt->usableSize = usableSize;
  1797 #ifndef SQLITE_OMIT_AUTOVACUUM
  1798     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
  1799     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
  1800 #endif
  1801   }
  1802 
  1803   /* maxLocal is the maximum amount of payload to store locally for
  1804   ** a cell.  Make sure it is small enough so that at least minFanout
  1805   ** cells can will fit on one page.  We assume a 10-byte page header.
  1806   ** Besides the payload, the cell must store:
  1807   **     2-byte pointer to the cell
  1808   **     4-byte child pointer
  1809   **     9-byte nKey value
  1810   **     4-byte nData value
  1811   **     4-byte overflow page pointer
  1812   ** So a cell consists of a 2-byte poiner, a header which is as much as
  1813   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  1814   ** page pointer.
  1815   */
  1816   pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
  1817   pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
  1818   pBt->maxLeaf = pBt->usableSize - 35;
  1819   pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
  1820   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  1821   pBt->pPage1 = pPage1;
  1822   return SQLITE_OK;
  1823 
  1824 page1_init_failed:
  1825   releasePage(pPage1);
  1826   pBt->pPage1 = 0;
  1827   return rc;
  1828 }
  1829 
  1830 /*
  1831 ** This routine works like lockBtree() except that it also invokes the
  1832 ** busy callback if there is lock contention.
  1833 */
  1834 static int lockBtreeWithRetry(Btree *pRef){
  1835   int rc = SQLITE_OK;
  1836 
  1837   assert( sqlite3BtreeHoldsMutex(pRef) );
  1838   if( pRef->inTrans==TRANS_NONE ){
  1839     u8 inTransaction = pRef->pBt->inTransaction;
  1840     btreeIntegrity(pRef);
  1841     rc = sqlite3BtreeBeginTrans(pRef, 0);
  1842     pRef->pBt->inTransaction = inTransaction;
  1843     pRef->inTrans = TRANS_NONE;
  1844     if( rc==SQLITE_OK ){
  1845       pRef->pBt->nTransaction--;
  1846     }
  1847     btreeIntegrity(pRef);
  1848   }
  1849   return rc;
  1850 }
  1851        
  1852 
  1853 /*
  1854 ** If there are no outstanding cursors and we are not in the middle
  1855 ** of a transaction but there is a read lock on the database, then
  1856 ** this routine unrefs the first page of the database file which 
  1857 ** has the effect of releasing the read lock.
  1858 **
  1859 ** If there are any outstanding cursors, this routine is a no-op.
  1860 **
  1861 ** If there is a transaction in progress, this routine is a no-op.
  1862 */
  1863 static void unlockBtreeIfUnused(BtShared *pBt){
  1864   assert( sqlite3_mutex_held(pBt->mutex) );
  1865   if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
  1866     if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
  1867       assert( pBt->pPage1->aData );
  1868 #if 0
  1869       if( pBt->pPage1->aData==0 ){
  1870         MemPage *pPage = pBt->pPage1;
  1871         pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
  1872         pPage->pBt = pBt;
  1873         pPage->pgno = 1;
  1874       }
  1875 #endif
  1876       releasePage(pBt->pPage1);
  1877     }
  1878     pBt->pPage1 = 0;
  1879     pBt->inStmt = 0;
  1880   }
  1881 }
  1882 
  1883 /*
  1884 ** Create a new database by initializing the first page of the
  1885 ** file.
  1886 */
  1887 static int newDatabase(BtShared *pBt){
  1888   MemPage *pP1;
  1889   unsigned char *data;
  1890   int rc;
  1891   int nPage;
  1892 
  1893   assert( sqlite3_mutex_held(pBt->mutex) );
  1894   rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
  1895   if( rc!=SQLITE_OK || nPage>0 ){
  1896     return rc;
  1897   }
  1898   pP1 = pBt->pPage1;
  1899   assert( pP1!=0 );
  1900   data = pP1->aData;
  1901   rc = sqlite3PagerWrite(pP1->pDbPage);
  1902   if( rc ) return rc;
  1903   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
  1904   assert( sizeof(zMagicHeader)==16 );
  1905   put2byte(&data[16], pBt->pageSize);
  1906   data[18] = 1;
  1907   data[19] = 1;
  1908   data[20] = pBt->pageSize - pBt->usableSize;
  1909   data[21] = 64;
  1910   data[22] = 32;
  1911   data[23] = 32;
  1912   memset(&data[24], 0, 100-24);
  1913   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
  1914   pBt->pageSizeFixed = 1;
  1915 #ifndef SQLITE_OMIT_AUTOVACUUM
  1916   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
  1917   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
  1918   put4byte(&data[36 + 4*4], pBt->autoVacuum);
  1919   put4byte(&data[36 + 7*4], pBt->incrVacuum);
  1920 #endif
  1921   return SQLITE_OK;
  1922 }
  1923 
  1924 /*
  1925 ** Attempt to start a new transaction. A write-transaction
  1926 ** is started if the second argument is nonzero, otherwise a read-
  1927 ** transaction.  If the second argument is 2 or more and exclusive
  1928 ** transaction is started, meaning that no other process is allowed
  1929 ** to access the database.  A preexisting transaction may not be
  1930 ** upgraded to exclusive by calling this routine a second time - the
  1931 ** exclusivity flag only works for a new transaction.
  1932 **
  1933 ** A write-transaction must be started before attempting any 
  1934 ** changes to the database.  None of the following routines 
  1935 ** will work unless a transaction is started first:
  1936 **
  1937 **      sqlite3BtreeCreateTable()
  1938 **      sqlite3BtreeCreateIndex()
  1939 **      sqlite3BtreeClearTable()
  1940 **      sqlite3BtreeDropTable()
  1941 **      sqlite3BtreeInsert()
  1942 **      sqlite3BtreeDelete()
  1943 **      sqlite3BtreeUpdateMeta()
  1944 **
  1945 ** If an initial attempt to acquire the lock fails because of lock contention
  1946 ** and the database was previously unlocked, then invoke the busy handler
  1947 ** if there is one.  But if there was previously a read-lock, do not
  1948 ** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
  1949 ** returned when there is already a read-lock in order to avoid a deadlock.
  1950 **
  1951 ** Suppose there are two processes A and B.  A has a read lock and B has
  1952 ** a reserved lock.  B tries to promote to exclusive but is blocked because
  1953 ** of A's read lock.  A tries to promote to reserved but is blocked by B.
  1954 ** One or the other of the two processes must give way or there can be
  1955 ** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
  1956 ** when A already has a read lock, we encourage A to give up and let B
  1957 ** proceed.
  1958 */
  1959 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  1960   BtShared *pBt = p->pBt;
  1961   int rc = SQLITE_OK;
  1962 
  1963   sqlite3BtreeEnter(p);
  1964   pBt->db = p->db;
  1965   btreeIntegrity(p);
  1966 
  1967   /* If the btree is already in a write-transaction, or it
  1968   ** is already in a read-transaction and a read-transaction
  1969   ** is requested, this is a no-op.
  1970   */
  1971   if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
  1972     goto trans_begun;
  1973   }
  1974 
  1975   /* Write transactions are not possible on a read-only database */
  1976   if( pBt->readOnly && wrflag ){
  1977     rc = SQLITE_READONLY;
  1978     goto trans_begun;
  1979   }
  1980 
  1981   /* If another database handle has already opened a write transaction 
  1982   ** on this shared-btree structure and a second write transaction is
  1983   ** requested, return SQLITE_BUSY.
  1984   */
  1985   if( pBt->inTransaction==TRANS_WRITE && wrflag ){
  1986     rc = SQLITE_BUSY;
  1987     goto trans_begun;
  1988   }
  1989 
  1990 #ifndef SQLITE_OMIT_SHARED_CACHE
  1991   if( wrflag>1 ){
  1992     BtLock *pIter;
  1993     for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
  1994       if( pIter->pBtree!=p ){
  1995         rc = SQLITE_BUSY;
  1996         goto trans_begun;
  1997       }
  1998     }
  1999   }
  2000 #endif
  2001 
  2002   do {
  2003     if( pBt->pPage1==0 ){
  2004       do{
  2005         rc = lockBtree(pBt);
  2006       }while( pBt->pPage1==0 && rc==SQLITE_OK );
  2007     }
  2008 
  2009     if( rc==SQLITE_OK && wrflag ){
  2010       if( pBt->readOnly ){
  2011         rc = SQLITE_READONLY;
  2012       }else{
  2013         rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
  2014         if( rc==SQLITE_OK ){
  2015           rc = newDatabase(pBt);
  2016         }
  2017       }
  2018     }
  2019   
  2020     if( rc==SQLITE_OK ){
  2021       if( wrflag ) pBt->inStmt = 0;
  2022     }else{
  2023       unlockBtreeIfUnused(pBt);
  2024     }
  2025   }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
  2026           sqlite3BtreeInvokeBusyHandler(pBt, 0) );
  2027 
  2028   if( rc==SQLITE_OK ){
  2029     if( p->inTrans==TRANS_NONE ){
  2030       pBt->nTransaction++;
  2031     }
  2032     p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
  2033     if( p->inTrans>pBt->inTransaction ){
  2034       pBt->inTransaction = p->inTrans;
  2035     }
  2036 #ifndef SQLITE_OMIT_SHARED_CACHE
  2037     if( wrflag>1 ){
  2038       assert( !pBt->pExclusive );
  2039       pBt->pExclusive = p;
  2040     }
  2041 #endif
  2042   }
  2043 
  2044 
  2045 trans_begun:
  2046   btreeIntegrity(p);
  2047   sqlite3BtreeLeave(p);
  2048   return rc;
  2049 }
  2050 
  2051 
  2052 #ifndef SQLITE_OMIT_AUTOVACUUM
  2053 
  2054 /*
  2055 ** Set the pointer-map entries for all children of page pPage. Also, if
  2056 ** pPage contains cells that point to overflow pages, set the pointer
  2057 ** map entries for the overflow pages as well.
  2058 */
  2059 static int setChildPtrmaps(MemPage *pPage){
  2060   int i;                             /* Counter variable */
  2061   int nCell;                         /* Number of cells in page pPage */
  2062   int rc;                            /* Return code */
  2063   BtShared *pBt = pPage->pBt;
  2064   int isInitOrig = pPage->isInit;
  2065   Pgno pgno = pPage->pgno;
  2066 
  2067   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2068   rc = sqlite3BtreeInitPage(pPage);
  2069   if( rc!=SQLITE_OK ){
  2070     goto set_child_ptrmaps_out;
  2071   }
  2072   nCell = pPage->nCell;
  2073 
  2074   for(i=0; i<nCell; i++){
  2075     u8 *pCell = findCell(pPage, i);
  2076 
  2077     rc = ptrmapPutOvflPtr(pPage, pCell);
  2078     if( rc!=SQLITE_OK ){
  2079       goto set_child_ptrmaps_out;
  2080     }
  2081 
  2082     if( !pPage->leaf ){
  2083       Pgno childPgno = get4byte(pCell);
  2084       rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
  2085       if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
  2086     }
  2087   }
  2088 
  2089   if( !pPage->leaf ){
  2090     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  2091     rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
  2092   }
  2093 
  2094 set_child_ptrmaps_out:
  2095   pPage->isInit = isInitOrig;
  2096   return rc;
  2097 }
  2098 
  2099 /*
  2100 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
  2101 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
  2102 ** iTo. Parameter eType describes the type of pointer to be modified, as 
  2103 ** follows:
  2104 **
  2105 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
  2106 **                   page of pPage.
  2107 **
  2108 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
  2109 **                   page pointed to by one of the cells on pPage.
  2110 **
  2111 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
  2112 **                   overflow page in the list.
  2113 */
  2114 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
  2115   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2116   if( eType==PTRMAP_OVERFLOW2 ){
  2117     /* The pointer is always the first 4 bytes of the page in this case.  */
  2118     if( get4byte(pPage->aData)!=iFrom ){
  2119       return SQLITE_CORRUPT_BKPT;
  2120     }
  2121     put4byte(pPage->aData, iTo);
  2122   }else{
  2123     int isInitOrig = pPage->isInit;
  2124     int i;
  2125     int nCell;
  2126 
  2127     sqlite3BtreeInitPage(pPage);
  2128     nCell = pPage->nCell;
  2129 
  2130     for(i=0; i<nCell; i++){
  2131       u8 *pCell = findCell(pPage, i);
  2132       if( eType==PTRMAP_OVERFLOW1 ){
  2133         CellInfo info;
  2134         sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  2135         if( info.iOverflow ){
  2136           if( iFrom==get4byte(&pCell[info.iOverflow]) ){
  2137             put4byte(&pCell[info.iOverflow], iTo);
  2138             break;
  2139           }
  2140         }
  2141       }else{
  2142         if( get4byte(pCell)==iFrom ){
  2143           put4byte(pCell, iTo);
  2144           break;
  2145         }
  2146       }
  2147     }
  2148   
  2149     if( i==nCell ){
  2150       if( eType!=PTRMAP_BTREE || 
  2151           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
  2152         return SQLITE_CORRUPT_BKPT;
  2153       }
  2154       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
  2155     }
  2156 
  2157     pPage->isInit = isInitOrig;
  2158   }
  2159   return SQLITE_OK;
  2160 }
  2161 
  2162 
  2163 /*
  2164 ** Move the open database page pDbPage to location iFreePage in the 
  2165 ** database. The pDbPage reference remains valid.
  2166 */
  2167 static int relocatePage(
  2168   BtShared *pBt,           /* Btree */
  2169   MemPage *pDbPage,        /* Open page to move */
  2170   u8 eType,                /* Pointer map 'type' entry for pDbPage */
  2171   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
  2172   Pgno iFreePage,          /* The location to move pDbPage to */
  2173   int isCommit
  2174 ){
  2175   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
  2176   Pgno iDbPage = pDbPage->pgno;
  2177   Pager *pPager = pBt->pPager;
  2178   int rc;
  2179 
  2180   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
  2181       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
  2182   assert( sqlite3_mutex_held(pBt->mutex) );
  2183   assert( pDbPage->pBt==pBt );
  2184 
  2185   /* Move page iDbPage from its current location to page number iFreePage */
  2186   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
  2187       iDbPage, iFreePage, iPtrPage, eType));
  2188   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
  2189   if( rc!=SQLITE_OK ){
  2190     return rc;
  2191   }
  2192   pDbPage->pgno = iFreePage;
  2193 
  2194   /* If pDbPage was a btree-page, then it may have child pages and/or cells
  2195   ** that point to overflow pages. The pointer map entries for all these
  2196   ** pages need to be changed.
  2197   **
  2198   ** If pDbPage is an overflow page, then the first 4 bytes may store a
  2199   ** pointer to a subsequent overflow page. If this is the case, then
  2200   ** the pointer map needs to be updated for the subsequent overflow page.
  2201   */
  2202   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
  2203     rc = setChildPtrmaps(pDbPage);
  2204     if( rc!=SQLITE_OK ){
  2205       return rc;
  2206     }
  2207   }else{
  2208     Pgno nextOvfl = get4byte(pDbPage->aData);
  2209     if( nextOvfl!=0 ){
  2210       rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
  2211       if( rc!=SQLITE_OK ){
  2212         return rc;
  2213       }
  2214     }
  2215   }
  2216 
  2217   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
  2218   ** that it points at iFreePage. Also fix the pointer map entry for
  2219   ** iPtrPage.
  2220   */
  2221   if( eType!=PTRMAP_ROOTPAGE ){
  2222     rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
  2223     if( rc!=SQLITE_OK ){
  2224       return rc;
  2225     }
  2226     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
  2227     if( rc!=SQLITE_OK ){
  2228       releasePage(pPtrPage);
  2229       return rc;
  2230     }
  2231     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
  2232     releasePage(pPtrPage);
  2233     if( rc==SQLITE_OK ){
  2234       rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
  2235     }
  2236   }
  2237   return rc;
  2238 }
  2239 
  2240 /* Forward declaration required by incrVacuumStep(). */
  2241 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
  2242 
  2243 /*
  2244 ** Perform a single step of an incremental-vacuum. If successful,
  2245 ** return SQLITE_OK. If there is no work to do (and therefore no
  2246 ** point in calling this function again), return SQLITE_DONE.
  2247 **
  2248 ** More specificly, this function attempts to re-organize the 
  2249 ** database so that the last page of the file currently in use
  2250 ** is no longer in use.
  2251 **
  2252 ** If the nFin parameter is non-zero, the implementation assumes
  2253 ** that the caller will keep calling incrVacuumStep() until
  2254 ** it returns SQLITE_DONE or an error, and that nFin is the
  2255 ** number of pages the database file will contain after this 
  2256 ** process is complete.
  2257 */
  2258 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
  2259   Pgno iLastPg;             /* Last page in the database */
  2260   Pgno nFreeList;           /* Number of pages still on the free-list */
  2261 
  2262   assert( sqlite3_mutex_held(pBt->mutex) );
  2263   iLastPg = pBt->nTrunc;
  2264   if( iLastPg==0 ){
  2265     iLastPg = pagerPagecount(pBt->pPager);
  2266   }
  2267 
  2268   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
  2269     int rc;
  2270     u8 eType;
  2271     Pgno iPtrPage;
  2272 
  2273     nFreeList = get4byte(&pBt->pPage1->aData[36]);
  2274     if( nFreeList==0 || nFin==iLastPg ){
  2275       return SQLITE_DONE;
  2276     }
  2277 
  2278     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
  2279     if( rc!=SQLITE_OK ){
  2280       return rc;
  2281     }
  2282     if( eType==PTRMAP_ROOTPAGE ){
  2283       return SQLITE_CORRUPT_BKPT;
  2284     }
  2285 
  2286     if( eType==PTRMAP_FREEPAGE ){
  2287       if( nFin==0 ){
  2288         /* Remove the page from the files free-list. This is not required
  2289         ** if nFin is non-zero. In that case, the free-list will be
  2290         ** truncated to zero after this function returns, so it doesn't 
  2291         ** matter if it still contains some garbage entries.
  2292         */
  2293         Pgno iFreePg;
  2294         MemPage *pFreePg;
  2295         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
  2296         if( rc!=SQLITE_OK ){
  2297           return rc;
  2298         }
  2299         assert( iFreePg==iLastPg );
  2300         releasePage(pFreePg);
  2301       }
  2302     } else {
  2303       Pgno iFreePg;             /* Index of free page to move pLastPg to */
  2304       MemPage *pLastPg;
  2305 
  2306       rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
  2307       if( rc!=SQLITE_OK ){
  2308         return rc;
  2309       }
  2310 
  2311       /* If nFin is zero, this loop runs exactly once and page pLastPg
  2312       ** is swapped with the first free page pulled off the free list.
  2313       **
  2314       ** On the other hand, if nFin is greater than zero, then keep
  2315       ** looping until a free-page located within the first nFin pages
  2316       ** of the file is found.
  2317       */
  2318       do {
  2319         MemPage *pFreePg;
  2320         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
  2321         if( rc!=SQLITE_OK ){
  2322           releasePage(pLastPg);
  2323           return rc;
  2324         }
  2325         releasePage(pFreePg);
  2326       }while( nFin!=0 && iFreePg>nFin );
  2327       assert( iFreePg<iLastPg );
  2328       
  2329       rc = sqlite3PagerWrite(pLastPg->pDbPage);
  2330       if( rc==SQLITE_OK ){
  2331         rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
  2332       }
  2333       releasePage(pLastPg);
  2334       if( rc!=SQLITE_OK ){
  2335         return rc;
  2336       }
  2337     }
  2338   }
  2339 
  2340   pBt->nTrunc = iLastPg - 1;
  2341   while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
  2342     pBt->nTrunc--;
  2343   }
  2344   return SQLITE_OK;
  2345 }
  2346 
  2347 /*
  2348 ** A write-transaction must be opened before calling this function.
  2349 ** It performs a single unit of work towards an incremental vacuum.
  2350 **
  2351 ** If the incremental vacuum is finished after this function has run,
  2352 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
  2353 ** SQLITE_OK is returned. Otherwise an SQLite error code. 
  2354 */
  2355 int sqlite3BtreeIncrVacuum(Btree *p){
  2356   int rc;
  2357   BtShared *pBt = p->pBt;
  2358 
  2359   sqlite3BtreeEnter(p);
  2360   pBt->db = p->db;
  2361   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  2362   if( !pBt->autoVacuum ){
  2363     rc = SQLITE_DONE;
  2364   }else{
  2365     invalidateAllOverflowCache(pBt);
  2366     rc = incrVacuumStep(pBt, 0);
  2367   }
  2368   sqlite3BtreeLeave(p);
  2369   return rc;
  2370 }
  2371 
  2372 /*
  2373 ** This routine is called prior to sqlite3PagerCommit when a transaction
  2374 ** is commited for an auto-vacuum database.
  2375 **
  2376 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
  2377 ** the database file should be truncated to during the commit process. 
  2378 ** i.e. the database has been reorganized so that only the first *pnTrunc
  2379 ** pages are in use.
  2380 */
  2381 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
  2382   int rc = SQLITE_OK;
  2383   Pager *pPager = pBt->pPager;
  2384   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
  2385 
  2386   assert( sqlite3_mutex_held(pBt->mutex) );
  2387   invalidateAllOverflowCache(pBt);
  2388   assert(pBt->autoVacuum);
  2389   if( !pBt->incrVacuum ){
  2390     Pgno nFin = 0;
  2391 
  2392     if( pBt->nTrunc==0 ){
  2393       Pgno nFree;
  2394       Pgno nPtrmap;
  2395       const int pgsz = pBt->pageSize;
  2396       int nOrig = pagerPagecount(pBt->pPager);
  2397 
  2398       if( PTRMAP_ISPAGE(pBt, nOrig) ){
  2399         return SQLITE_CORRUPT_BKPT;
  2400       }
  2401       if( nOrig==PENDING_BYTE_PAGE(pBt) ){
  2402         nOrig--;
  2403       }
  2404       nFree = get4byte(&pBt->pPage1->aData[36]);
  2405       nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
  2406       nFin = nOrig - nFree - nPtrmap;
  2407       if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
  2408         nFin--;
  2409       }
  2410       while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
  2411         nFin--;
  2412       }
  2413     }
  2414 
  2415     while( rc==SQLITE_OK ){
  2416       rc = incrVacuumStep(pBt, nFin);
  2417     }
  2418     if( rc==SQLITE_DONE ){
  2419       assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
  2420       rc = SQLITE_OK;
  2421       if( pBt->nTrunc && nFin ){
  2422         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  2423         put4byte(&pBt->pPage1->aData[32], 0);
  2424         put4byte(&pBt->pPage1->aData[36], 0);
  2425         pBt->nTrunc = nFin;
  2426       }
  2427     }
  2428     if( rc!=SQLITE_OK ){
  2429       sqlite3PagerRollback(pPager);
  2430     }
  2431   }
  2432 
  2433   if( rc==SQLITE_OK ){
  2434     *pnTrunc = pBt->nTrunc;
  2435     pBt->nTrunc = 0;
  2436   }
  2437   assert( nRef==sqlite3PagerRefcount(pPager) );
  2438   return rc;
  2439 }
  2440 
  2441 #endif
  2442 
  2443 /*
  2444 ** This routine does the first phase of a two-phase commit.  This routine
  2445 ** causes a rollback journal to be created (if it does not already exist)
  2446 ** and populated with enough information so that if a power loss occurs
  2447 ** the database can be restored to its original state by playing back
  2448 ** the journal.  Then the contents of the journal are flushed out to
  2449 ** the disk.  After the journal is safely on oxide, the changes to the
  2450 ** database are written into the database file and flushed to oxide.
  2451 ** At the end of this call, the rollback journal still exists on the
  2452 ** disk and we are still holding all locks, so the transaction has not
  2453 ** committed.  See sqlite3BtreeCommit() for the second phase of the
  2454 ** commit process.
  2455 **
  2456 ** This call is a no-op if no write-transaction is currently active on pBt.
  2457 **
  2458 ** Otherwise, sync the database file for the btree pBt. zMaster points to
  2459 ** the name of a master journal file that should be written into the
  2460 ** individual journal file, or is NULL, indicating no master journal file 
  2461 ** (single database transaction).
  2462 **
  2463 ** When this is called, the master journal should already have been
  2464 ** created, populated with this journal pointer and synced to disk.
  2465 **
  2466 ** Once this is routine has returned, the only thing required to commit
  2467 ** the write-transaction for this database file is to delete the journal.
  2468 */
  2469 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
  2470   int rc = SQLITE_OK;
  2471   if( p->inTrans==TRANS_WRITE ){
  2472     BtShared *pBt = p->pBt;
  2473     Pgno nTrunc = 0;
  2474     sqlite3BtreeEnter(p);
  2475     pBt->db = p->db;
  2476 #ifndef SQLITE_OMIT_AUTOVACUUM
  2477     if( pBt->autoVacuum ){
  2478       rc = autoVacuumCommit(pBt, &nTrunc); 
  2479       if( rc!=SQLITE_OK ){
  2480         sqlite3BtreeLeave(p);
  2481         return rc;
  2482       }
  2483     }
  2484 #endif
  2485     rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
  2486     sqlite3BtreeLeave(p);
  2487   }
  2488   return rc;
  2489 }
  2490 
  2491 /*
  2492 ** Commit the transaction currently in progress.
  2493 **
  2494 ** This routine implements the second phase of a 2-phase commit.  The
  2495 ** sqlite3BtreeSync() routine does the first phase and should be invoked
  2496 ** prior to calling this routine.  The sqlite3BtreeSync() routine did
  2497 ** all the work of writing information out to disk and flushing the
  2498 ** contents so that they are written onto the disk platter.  All this
  2499 ** routine has to do is delete or truncate the rollback journal
  2500 ** (which causes the transaction to commit) and drop locks.
  2501 **
  2502 ** This will release the write lock on the database file.  If there
  2503 ** are no active cursors, it also releases the read lock.
  2504 */
  2505 int sqlite3BtreeCommitPhaseTwo(Btree *p){
  2506   BtShared *pBt = p->pBt;
  2507 
  2508   sqlite3BtreeEnter(p);
  2509   pBt->db = p->db;
  2510   btreeIntegrity(p);
  2511 
  2512   /* If the handle has a write-transaction open, commit the shared-btrees 
  2513   ** transaction and set the shared state to TRANS_READ.
  2514   */
  2515   if( p->inTrans==TRANS_WRITE ){
  2516     int rc;
  2517     assert( pBt->inTransaction==TRANS_WRITE );
  2518     assert( pBt->nTransaction>0 );
  2519     rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
  2520     if( rc!=SQLITE_OK ){
  2521       sqlite3BtreeLeave(p);
  2522       return rc;
  2523     }
  2524     pBt->inTransaction = TRANS_READ;
  2525     pBt->inStmt = 0;
  2526   }
  2527   unlockAllTables(p);
  2528 
  2529   /* If the handle has any kind of transaction open, decrement the transaction
  2530   ** count of the shared btree. If the transaction count reaches 0, set
  2531   ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
  2532   ** will unlock the pager.
  2533   */
  2534   if( p->inTrans!=TRANS_NONE ){
  2535     pBt->nTransaction--;
  2536     if( 0==pBt->nTransaction ){
  2537       pBt->inTransaction = TRANS_NONE;
  2538     }
  2539   }
  2540 
  2541   /* Set the handles current transaction state to TRANS_NONE and unlock
  2542   ** the pager if this call closed the only read or write transaction.
  2543   */
  2544   p->inTrans = TRANS_NONE;
  2545   unlockBtreeIfUnused(pBt);
  2546 
  2547   btreeIntegrity(p);
  2548   sqlite3BtreeLeave(p);
  2549   return SQLITE_OK;
  2550 }
  2551 
  2552 /*
  2553 ** Do both phases of a commit.
  2554 */
  2555 int sqlite3BtreeCommit(Btree *p){
  2556   int rc;
  2557   sqlite3BtreeEnter(p);
  2558   rc = sqlite3BtreeCommitPhaseOne(p, 0);
  2559   if( rc==SQLITE_OK ){
  2560     rc = sqlite3BtreeCommitPhaseTwo(p);
  2561   }
  2562   sqlite3BtreeLeave(p);
  2563   return rc;
  2564 }
  2565 
  2566 #ifndef NDEBUG
  2567 /*
  2568 ** Return the number of write-cursors open on this handle. This is for use
  2569 ** in assert() expressions, so it is only compiled if NDEBUG is not
  2570 ** defined.
  2571 **
  2572 ** For the purposes of this routine, a write-cursor is any cursor that
  2573 ** is capable of writing to the databse.  That means the cursor was
  2574 ** originally opened for writing and the cursor has not be disabled
  2575 ** by having its state changed to CURSOR_FAULT.
  2576 */
  2577 static int countWriteCursors(BtShared *pBt){
  2578   BtCursor *pCur;
  2579   int r = 0;
  2580   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
  2581     if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++; 
  2582   }
  2583   return r;
  2584 }
  2585 #endif
  2586 
  2587 /*
  2588 ** This routine sets the state to CURSOR_FAULT and the error
  2589 ** code to errCode for every cursor on BtShared that pBtree
  2590 ** references.
  2591 **
  2592 ** Every cursor is tripped, including cursors that belong
  2593 ** to other database connections that happen to be sharing
  2594 ** the cache with pBtree.
  2595 **
  2596 ** This routine gets called when a rollback occurs.
  2597 ** All cursors using the same cache must be tripped
  2598 ** to prevent them from trying to use the btree after
  2599 ** the rollback.  The rollback may have deleted tables
  2600 ** or moved root pages, so it is not sufficient to
  2601 ** save the state of the cursor.  The cursor must be
  2602 ** invalidated.
  2603 */
  2604 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
  2605   BtCursor *p;
  2606   sqlite3BtreeEnter(pBtree);
  2607   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  2608     sqlite3BtreeClearCursor(p);
  2609     p->eState = CURSOR_FAULT;
  2610     p->skip = errCode;
  2611   }
  2612   sqlite3BtreeLeave(pBtree);
  2613 }
  2614 
  2615 /*
  2616 ** Rollback the transaction in progress.  All cursors will be
  2617 ** invalided by this operation.  Any attempt to use a cursor
  2618 ** that was open at the beginning of this operation will result
  2619 ** in an error.
  2620 **
  2621 ** This will release the write lock on the database file.  If there
  2622 ** are no active cursors, it also releases the read lock.
  2623 */
  2624 int sqlite3BtreeRollback(Btree *p){
  2625   int rc;
  2626   BtShared *pBt = p->pBt;
  2627   MemPage *pPage1;
  2628 
  2629   sqlite3BtreeEnter(p);
  2630   pBt->db = p->db;
  2631   rc = saveAllCursors(pBt, 0, 0);
  2632 #ifndef SQLITE_OMIT_SHARED_CACHE
  2633   if( rc!=SQLITE_OK ){
  2634     /* This is a horrible situation. An IO or malloc() error occured whilst
  2635     ** trying to save cursor positions. If this is an automatic rollback (as
  2636     ** the result of a constraint, malloc() failure or IO error) then 
  2637     ** the cache may be internally inconsistent (not contain valid trees) so
  2638     ** we cannot simply return the error to the caller. Instead, abort 
  2639     ** all queries that may be using any of the cursors that failed to save.
  2640     */
  2641     sqlite3BtreeTripAllCursors(p, rc);
  2642   }
  2643 #endif
  2644   btreeIntegrity(p);
  2645   unlockAllTables(p);
  2646 
  2647   if( p->inTrans==TRANS_WRITE ){
  2648     int rc2;
  2649 
  2650 #ifndef SQLITE_OMIT_AUTOVACUUM
  2651     pBt->nTrunc = 0;
  2652 #endif
  2653 
  2654     assert( TRANS_WRITE==pBt->inTransaction );
  2655     rc2 = sqlite3PagerRollback(pBt->pPager);
  2656     if( rc2!=SQLITE_OK ){
  2657       rc = rc2;
  2658     }
  2659 
  2660     /* The rollback may have destroyed the pPage1->aData value.  So
  2661     ** call sqlite3BtreeGetPage() on page 1 again to make
  2662     ** sure pPage1->aData is set correctly. */
  2663     if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
  2664       releasePage(pPage1);
  2665     }
  2666     assert( countWriteCursors(pBt)==0 );
  2667     pBt->inTransaction = TRANS_READ;
  2668   }
  2669 
  2670   if( p->inTrans!=TRANS_NONE ){
  2671     assert( pBt->nTransaction>0 );
  2672     pBt->nTransaction--;
  2673     if( 0==pBt->nTransaction ){
  2674       pBt->inTransaction = TRANS_NONE;
  2675     }
  2676   }
  2677 
  2678   p->inTrans = TRANS_NONE;
  2679   pBt->inStmt = 0;
  2680   unlockBtreeIfUnused(pBt);
  2681 
  2682   btreeIntegrity(p);
  2683   sqlite3BtreeLeave(p);
  2684   return rc;
  2685 }
  2686 
  2687 /*
  2688 ** Start a statement subtransaction.  The subtransaction can
  2689 ** can be rolled back independently of the main transaction.
  2690 ** You must start a transaction before starting a subtransaction.
  2691 ** The subtransaction is ended automatically if the main transaction
  2692 ** commits or rolls back.
  2693 **
  2694 ** Only one subtransaction may be active at a time.  It is an error to try
  2695 ** to start a new subtransaction if another subtransaction is already active.
  2696 **
  2697 ** Statement subtransactions are used around individual SQL statements
  2698 ** that are contained within a BEGIN...COMMIT block.  If a constraint
  2699 ** error occurs within the statement, the effect of that one statement
  2700 ** can be rolled back without having to rollback the entire transaction.
  2701 */
  2702 int sqlite3BtreeBeginStmt(Btree *p){
  2703   int rc;
  2704   BtShared *pBt = p->pBt;
  2705   sqlite3BtreeEnter(p);
  2706   pBt->db = p->db;
  2707   if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
  2708     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  2709   }else{
  2710     assert( pBt->inTransaction==TRANS_WRITE );
  2711     rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
  2712     pBt->inStmt = 1;
  2713   }
  2714   sqlite3BtreeLeave(p);
  2715   return rc;
  2716 }
  2717 
  2718 
  2719 /*
  2720 ** Commit the statment subtransaction currently in progress.  If no
  2721 ** subtransaction is active, this is a no-op.
  2722 */
  2723 int sqlite3BtreeCommitStmt(Btree *p){
  2724   int rc;
  2725   BtShared *pBt = p->pBt;
  2726   sqlite3BtreeEnter(p);
  2727   pBt->db = p->db;
  2728   if( pBt->inStmt && !pBt->readOnly ){
  2729     rc = sqlite3PagerStmtCommit(pBt->pPager);
  2730   }else{
  2731     rc = SQLITE_OK;
  2732   }
  2733   pBt->inStmt = 0;
  2734   sqlite3BtreeLeave(p);
  2735   return rc;
  2736 }
  2737 
  2738 /*
  2739 ** Rollback the active statement subtransaction.  If no subtransaction
  2740 ** is active this routine is a no-op.
  2741 **
  2742 ** All cursors will be invalidated by this operation.  Any attempt
  2743 ** to use a cursor that was open at the beginning of this operation
  2744 ** will result in an error.
  2745 */
  2746 int sqlite3BtreeRollbackStmt(Btree *p){
  2747   int rc = SQLITE_OK;
  2748   BtShared *pBt = p->pBt;
  2749   sqlite3BtreeEnter(p);
  2750   pBt->db = p->db;
  2751   if( pBt->inStmt && !pBt->readOnly ){
  2752     rc = sqlite3PagerStmtRollback(pBt->pPager);
  2753     pBt->inStmt = 0;
  2754   }
  2755   sqlite3BtreeLeave(p);
  2756   return rc;
  2757 }
  2758 
  2759 /*
  2760 ** Create a new cursor for the BTree whose root is on the page
  2761 ** iTable.  The act of acquiring a cursor gets a read lock on 
  2762 ** the database file.
  2763 **
  2764 ** If wrFlag==0, then the cursor can only be used for reading.
  2765 ** If wrFlag==1, then the cursor can be used for reading or for
  2766 ** writing if other conditions for writing are also met.  These
  2767 ** are the conditions that must be met in order for writing to
  2768 ** be allowed:
  2769 **
  2770 ** 1:  The cursor must have been opened with wrFlag==1
  2771 **
  2772 ** 2:  Other database connections that share the same pager cache
  2773 **     but which are not in the READ_UNCOMMITTED state may not have
  2774 **     cursors open with wrFlag==0 on the same table.  Otherwise
  2775 **     the changes made by this write cursor would be visible to
  2776 **     the read cursors in the other database connection.
  2777 **
  2778 ** 3:  The database must be writable (not on read-only media)
  2779 **
  2780 ** 4:  There must be an active transaction.
  2781 **
  2782 ** No checking is done to make sure that page iTable really is the
  2783 ** root page of a b-tree.  If it is not, then the cursor acquired
  2784 ** will not work correctly.
  2785 **
  2786 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory 
  2787 ** pointed to by pCur have been zeroed by the caller.
  2788 */
  2789 static int btreeCursor(
  2790   Btree *p,                              /* The btree */
  2791   int iTable,                            /* Root page of table to open */
  2792   int wrFlag,                            /* 1 to write. 0 read-only */
  2793   struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  2794   BtCursor *pCur                         /* Space for new cursor */
  2795 ){
  2796   int rc;
  2797   BtShared *pBt = p->pBt;
  2798 
  2799   assert( sqlite3BtreeHoldsMutex(p) );
  2800   if( wrFlag ){
  2801     if( pBt->readOnly ){
  2802       return SQLITE_READONLY;
  2803     }
  2804     if( checkReadLocks(p, iTable, 0, 0) ){
  2805       return SQLITE_LOCKED;
  2806     }
  2807   }
  2808 
  2809   if( pBt->pPage1==0 ){
  2810     rc = lockBtreeWithRetry(p);
  2811     if( rc!=SQLITE_OK ){
  2812       return rc;
  2813     }
  2814     if( pBt->readOnly && wrFlag ){
  2815       return SQLITE_READONLY;
  2816     }
  2817   }
  2818   pCur->pgnoRoot = (Pgno)iTable;
  2819   if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
  2820     rc = SQLITE_EMPTY;
  2821     goto create_cursor_exception;
  2822   }
  2823   rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
  2824   if( rc!=SQLITE_OK ){
  2825     goto create_cursor_exception;
  2826   }
  2827 
  2828   /* Now that no other errors can occur, finish filling in the BtCursor
  2829   ** variables, link the cursor into the BtShared list and set *ppCur (the
  2830   ** output argument to this function).
  2831   */
  2832   pCur->pKeyInfo = pKeyInfo;
  2833   pCur->pBtree = p;
  2834   pCur->pBt = pBt;
  2835   pCur->wrFlag = wrFlag;
  2836   pCur->pNext = pBt->pCursor;
  2837   if( pCur->pNext ){
  2838     pCur->pNext->pPrev = pCur;
  2839   }
  2840   pBt->pCursor = pCur;
  2841   pCur->eState = CURSOR_INVALID;
  2842 
  2843   return SQLITE_OK;
  2844 
  2845 create_cursor_exception:
  2846   releasePage(pCur->apPage[0]);
  2847   unlockBtreeIfUnused(pBt);
  2848   return rc;
  2849 }
  2850 int sqlite3BtreeCursor(
  2851   Btree *p,                                   /* The btree */
  2852   int iTable,                                 /* Root page of table to open */
  2853   int wrFlag,                                 /* 1 to write. 0 read-only */
  2854   struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  2855   BtCursor *pCur                              /* Write new cursor here */
  2856 ){
  2857   int rc;
  2858   sqlite3BtreeEnter(p);
  2859   p->pBt->db = p->db;
  2860   rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  2861   sqlite3BtreeLeave(p);
  2862   return rc;
  2863 }
  2864 int sqlite3BtreeCursorSize(){
  2865   return sizeof(BtCursor);
  2866 }
  2867 
  2868 
  2869 
  2870 /*
  2871 ** Close a cursor.  The read lock on the database file is released
  2872 ** when the last cursor is closed.
  2873 */
  2874 int sqlite3BtreeCloseCursor(BtCursor *pCur){
  2875   Btree *pBtree = pCur->pBtree;
  2876   if( pBtree ){
  2877     int i;
  2878     BtShared *pBt = pCur->pBt;
  2879     sqlite3BtreeEnter(pBtree);
  2880     pBt->db = pBtree->db;
  2881     sqlite3BtreeClearCursor(pCur);
  2882     if( pCur->pPrev ){
  2883       pCur->pPrev->pNext = pCur->pNext;
  2884     }else{
  2885       pBt->pCursor = pCur->pNext;
  2886     }
  2887     if( pCur->pNext ){
  2888       pCur->pNext->pPrev = pCur->pPrev;
  2889     }
  2890     for(i=0; i<=pCur->iPage; i++){
  2891       releasePage(pCur->apPage[i]);
  2892     }
  2893     unlockBtreeIfUnused(pBt);
  2894     invalidateOverflowCache(pCur);
  2895     /* sqlite3_free(pCur); */
  2896     sqlite3BtreeLeave(pBtree);
  2897   }
  2898   return SQLITE_OK;
  2899 }
  2900 
  2901 /*
  2902 ** Make a temporary cursor by filling in the fields of pTempCur.
  2903 ** The temporary cursor is not on the cursor list for the Btree.
  2904 */
  2905 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
  2906   int i;
  2907   assert( cursorHoldsMutex(pCur) );
  2908   memcpy(pTempCur, pCur, sizeof(BtCursor));
  2909   pTempCur->pNext = 0;
  2910   pTempCur->pPrev = 0;
  2911   for(i=0; i<=pTempCur->iPage; i++){
  2912     sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
  2913   }
  2914 }
  2915 
  2916 /*
  2917 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
  2918 ** function above.
  2919 */
  2920 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
  2921   int i;
  2922   assert( cursorHoldsMutex(pCur) );
  2923   for(i=0; i<=pCur->iPage; i++){
  2924     sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
  2925   }
  2926 }
  2927 
  2928 /*
  2929 ** Make sure the BtCursor* given in the argument has a valid
  2930 ** BtCursor.info structure.  If it is not already valid, call
  2931 ** sqlite3BtreeParseCell() to fill it in.
  2932 **
  2933 ** BtCursor.info is a cache of the information in the current cell.
  2934 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
  2935 **
  2936 ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
  2937 ** compiler to crash when getCellInfo() is implemented as a macro.
  2938 ** But there is a measureable speed advantage to using the macro on gcc
  2939 ** (when less compiler optimizations like -Os or -O0 are used and the
  2940 ** compiler is not doing agressive inlining.)  So we use a real function
  2941 ** for MSVC and a macro for everything else.  Ticket #2457.
  2942 */
  2943 #ifndef NDEBUG
  2944   static void assertCellInfo(BtCursor *pCur){
  2945     CellInfo info;
  2946     int iPage = pCur->iPage;
  2947     memset(&info, 0, sizeof(info));
  2948     sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
  2949     assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
  2950   }
  2951 #else
  2952   #define assertCellInfo(x)
  2953 #endif
  2954 #ifdef _MSC_VER
  2955   /* Use a real function in MSVC to work around bugs in that compiler. */
  2956   static void getCellInfo(BtCursor *pCur){
  2957     if( pCur->info.nSize==0 ){
  2958       int iPage = pCur->iPage;
  2959       sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
  2960       pCur->validNKey = 1;
  2961     }else{
  2962       assertCellInfo(pCur);
  2963     }
  2964   }
  2965 #else /* if not _MSC_VER */
  2966   /* Use a macro in all other compilers so that the function is inlined */
  2967 #define getCellInfo(pCur)                                                      \
  2968   if( pCur->info.nSize==0 ){                                                   \
  2969     int iPage = pCur->iPage;                                                   \
  2970     sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
  2971     pCur->validNKey = 1;                                                       \
  2972   }else{                                                                       \
  2973     assertCellInfo(pCur);                                                      \
  2974   }
  2975 #endif /* _MSC_VER */
  2976 
  2977 /*
  2978 ** Set *pSize to the size of the buffer needed to hold the value of
  2979 ** the key for the current entry.  If the cursor is not pointing
  2980 ** to a valid entry, *pSize is set to 0. 
  2981 **
  2982 ** For a table with the INTKEY flag set, this routine returns the key
  2983 ** itself, not the number of bytes in the key.
  2984 */
  2985 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
  2986   int rc;
  2987 
  2988   assert( cursorHoldsMutex(pCur) );
  2989   rc = restoreCursorPosition(pCur);
  2990   if( rc==SQLITE_OK ){
  2991     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  2992     if( pCur->eState==CURSOR_INVALID ){
  2993       *pSize = 0;
  2994     }else{
  2995       getCellInfo(pCur);
  2996       *pSize = pCur->info.nKey;
  2997     }
  2998   }
  2999   return rc;
  3000 }
  3001 
  3002 /*
  3003 ** Set *pSize to the number of bytes of data in the entry the
  3004 ** cursor currently points to.  Always return SQLITE_OK.
  3005 ** Failure is not possible.  If the cursor is not currently
  3006 ** pointing to an entry (which can happen, for example, if
  3007 ** the database is empty) then *pSize is set to 0.
  3008 */
  3009 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
  3010   int rc;
  3011 
  3012   assert( cursorHoldsMutex(pCur) );
  3013   rc = restoreCursorPosition(pCur);
  3014   if( rc==SQLITE_OK ){
  3015     assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  3016     if( pCur->eState==CURSOR_INVALID ){
  3017       /* Not pointing at a valid entry - set *pSize to 0. */
  3018       *pSize = 0;
  3019     }else{
  3020       getCellInfo(pCur);
  3021       *pSize = pCur->info.nData;
  3022     }
  3023   }
  3024   return rc;
  3025 }
  3026 
  3027 /*
  3028 ** Given the page number of an overflow page in the database (parameter
  3029 ** ovfl), this function finds the page number of the next page in the 
  3030 ** linked list of overflow pages. If possible, it uses the auto-vacuum
  3031 ** pointer-map data instead of reading the content of page ovfl to do so. 
  3032 **
  3033 ** If an error occurs an SQLite error code is returned. Otherwise:
  3034 **
  3035 ** Unless pPgnoNext is NULL, the page number of the next overflow 
  3036 ** page in the linked list is written to *pPgnoNext. If page ovfl
  3037 ** is the last page in its linked list, *pPgnoNext is set to zero. 
  3038 **
  3039 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
  3040 ** for page ovfl. The underlying pager page may have been requested
  3041 ** with the noContent flag set, so the page data accessable via
  3042 ** this handle may not be trusted.
  3043 */
  3044 static int getOverflowPage(
  3045   BtShared *pBt, 
  3046   Pgno ovfl,                   /* Overflow page */
  3047   MemPage **ppPage,            /* OUT: MemPage handle */
  3048   Pgno *pPgnoNext              /* OUT: Next overflow page number */
  3049 ){
  3050   Pgno next = 0;
  3051   int rc;
  3052 
  3053   assert( sqlite3_mutex_held(pBt->mutex) );
  3054   /* One of these must not be NULL. Otherwise, why call this function? */
  3055   assert(ppPage || pPgnoNext);
  3056 
  3057   /* If pPgnoNext is NULL, then this function is being called to obtain
  3058   ** a MemPage* reference only. No page-data is required in this case.
  3059   */
  3060   if( !pPgnoNext ){
  3061     return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
  3062   }
  3063 
  3064 #ifndef SQLITE_OMIT_AUTOVACUUM
  3065   /* Try to find the next page in the overflow list using the
  3066   ** autovacuum pointer-map pages. Guess that the next page in 
  3067   ** the overflow list is page number (ovfl+1). If that guess turns 
  3068   ** out to be wrong, fall back to loading the data of page 
  3069   ** number ovfl to determine the next page number.
  3070   */
  3071   if( pBt->autoVacuum ){
  3072     Pgno pgno;
  3073     Pgno iGuess = ovfl+1;
  3074     u8 eType;
  3075 
  3076     while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
  3077       iGuess++;
  3078     }
  3079 
  3080     if( iGuess<=pagerPagecount(pBt->pPager) ){
  3081       rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
  3082       if( rc!=SQLITE_OK ){
  3083         return rc;
  3084       }
  3085       if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
  3086         next = iGuess;
  3087       }
  3088     }
  3089   }
  3090 #endif
  3091 
  3092   if( next==0 || ppPage ){
  3093     MemPage *pPage = 0;
  3094 
  3095     rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
  3096     assert(rc==SQLITE_OK || pPage==0);
  3097     if( next==0 && rc==SQLITE_OK ){
  3098       next = get4byte(pPage->aData);
  3099     }
  3100 
  3101     if( ppPage ){
  3102       *ppPage = pPage;
  3103     }else{
  3104       releasePage(pPage);
  3105     }
  3106   }
  3107   *pPgnoNext = next;
  3108 
  3109   return rc;
  3110 }
  3111 
  3112 /*
  3113 ** Copy data from a buffer to a page, or from a page to a buffer.
  3114 **
  3115 ** pPayload is a pointer to data stored on database page pDbPage.
  3116 ** If argument eOp is false, then nByte bytes of data are copied
  3117 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
  3118 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
  3119 ** of data are copied from the buffer pBuf to pPayload.
  3120 **
  3121 ** SQLITE_OK is returned on success, otherwise an error code.
  3122 */
  3123 static int copyPayload(
  3124   void *pPayload,           /* Pointer to page data */
  3125   void *pBuf,               /* Pointer to buffer */
  3126   int nByte,                /* Number of bytes to copy */
  3127   int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
  3128   DbPage *pDbPage           /* Page containing pPayload */
  3129 ){
  3130   if( eOp ){
  3131     /* Copy data from buffer to page (a write operation) */
  3132     int rc = sqlite3PagerWrite(pDbPage);
  3133     if( rc!=SQLITE_OK ){
  3134       return rc;
  3135     }
  3136     memcpy(pPayload, pBuf, nByte);
  3137   }else{
  3138     /* Copy data from page to buffer (a read operation) */
  3139     memcpy(pBuf, pPayload, nByte);
  3140   }
  3141   return SQLITE_OK;
  3142 }
  3143 
  3144 /*
  3145 ** This function is used to read or overwrite payload information
  3146 ** for the entry that the pCur cursor is pointing to. If the eOp
  3147 ** parameter is 0, this is a read operation (data copied into
  3148 ** buffer pBuf). If it is non-zero, a write (data copied from
  3149 ** buffer pBuf).
  3150 **
  3151 ** A total of "amt" bytes are read or written beginning at "offset".
  3152 ** Data is read to or from the buffer pBuf.
  3153 **
  3154 ** This routine does not make a distinction between key and data.
  3155 ** It just reads or writes bytes from the payload area.  Data might 
  3156 ** appear on the main page or be scattered out on multiple overflow 
  3157 ** pages.
  3158 **
  3159 ** If the BtCursor.isIncrblobHandle flag is set, and the current
  3160 ** cursor entry uses one or more overflow pages, this function
  3161 ** allocates space for and lazily popluates the overflow page-list 
  3162 ** cache array (BtCursor.aOverflow). Subsequent calls use this
  3163 ** cache to make seeking to the supplied offset more efficient.
  3164 **
  3165 ** Once an overflow page-list cache has been allocated, it may be
  3166 ** invalidated if some other cursor writes to the same table, or if
  3167 ** the cursor is moved to a different row. Additionally, in auto-vacuum
  3168 ** mode, the following events may invalidate an overflow page-list cache.
  3169 **
  3170 **   * An incremental vacuum,
  3171 **   * A commit in auto_vacuum="full" mode,
  3172 **   * Creating a table (may require moving an overflow page).
  3173 */
  3174 static int accessPayload(
  3175   BtCursor *pCur,      /* Cursor pointing to entry to read from */
  3176   int offset,          /* Begin reading this far into payload */
  3177   int amt,             /* Read this many bytes */
  3178   unsigned char *pBuf, /* Write the bytes into this buffer */ 
  3179   int skipKey,         /* offset begins at data if this is true */
  3180   int eOp              /* zero to read. non-zero to write. */
  3181 ){
  3182   unsigned char *aPayload;
  3183   int rc = SQLITE_OK;
  3184   u32 nKey;
  3185   int iIdx = 0;
  3186   MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  3187   BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
  3188 
  3189   assert( pPage );
  3190   assert( pCur->eState==CURSOR_VALID );
  3191   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3192   assert( offset>=0 );
  3193   assert( cursorHoldsMutex(pCur) );
  3194 
  3195   getCellInfo(pCur);
  3196   aPayload = pCur->info.pCell + pCur->info.nHeader;
  3197   nKey = (pPage->intKey ? 0 : pCur->info.nKey);
  3198 
  3199   if( skipKey ){
  3200     offset += nKey;
  3201   }
  3202   if( offset+amt > nKey+pCur->info.nData 
  3203    || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
  3204   ){
  3205     /* Trying to read or write past the end of the data is an error */
  3206     return SQLITE_CORRUPT_BKPT;
  3207   }
  3208 
  3209   /* Check if data must be read/written to/from the btree page itself. */
  3210   if( offset<pCur->info.nLocal ){
  3211     int a = amt;
  3212     if( a+offset>pCur->info.nLocal ){
  3213       a = pCur->info.nLocal - offset;
  3214     }
  3215     rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
  3216     offset = 0;
  3217     pBuf += a;
  3218     amt -= a;
  3219   }else{
  3220     offset -= pCur->info.nLocal;
  3221   }
  3222 
  3223   pBt = pCur->pBt;
  3224   if( rc==SQLITE_OK && amt>0 ){
  3225     const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
  3226     Pgno nextPage;
  3227 
  3228     nextPage = get4byte(&aPayload[pCur->info.nLocal]);
  3229 
  3230 #ifndef SQLITE_OMIT_INCRBLOB
  3231     /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
  3232     ** has not been allocated, allocate it now. The array is sized at
  3233     ** one entry for each overflow page in the overflow chain. The
  3234     ** page number of the first overflow page is stored in aOverflow[0],
  3235     ** etc. A value of 0 in the aOverflow[] array means "not yet known"
  3236     ** (the cache is lazily populated).
  3237     */
  3238     if( pCur->isIncrblobHandle && !pCur->aOverflow ){
  3239       int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
  3240       pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
  3241       if( nOvfl && !pCur->aOverflow ){
  3242         rc = SQLITE_NOMEM;
  3243       }
  3244     }
  3245 
  3246     /* If the overflow page-list cache has been allocated and the
  3247     ** entry for the first required overflow page is valid, skip
  3248     ** directly to it.
  3249     */
  3250     if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
  3251       iIdx = (offset/ovflSize);
  3252       nextPage = pCur->aOverflow[iIdx];
  3253       offset = (offset%ovflSize);
  3254     }
  3255 #endif
  3256 
  3257     for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
  3258 
  3259 #ifndef SQLITE_OMIT_INCRBLOB
  3260       /* If required, populate the overflow page-list cache. */
  3261       if( pCur->aOverflow ){
  3262         assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
  3263         pCur->aOverflow[iIdx] = nextPage;
  3264       }
  3265 #endif
  3266 
  3267       if( offset>=ovflSize ){
  3268         /* The only reason to read this page is to obtain the page
  3269         ** number for the next page in the overflow chain. The page
  3270         ** data is not required. So first try to lookup the overflow
  3271         ** page-list cache, if any, then fall back to the getOverflowPage()
  3272         ** function.
  3273         */
  3274 #ifndef SQLITE_OMIT_INCRBLOB
  3275         if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
  3276           nextPage = pCur->aOverflow[iIdx+1];
  3277         } else 
  3278 #endif
  3279           rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
  3280         offset -= ovflSize;
  3281       }else{
  3282         /* Need to read this page properly. It contains some of the
  3283         ** range of data that is being read (eOp==0) or written (eOp!=0).
  3284         */
  3285         DbPage *pDbPage;
  3286         int a = amt;
  3287         rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
  3288         if( rc==SQLITE_OK ){
  3289           aPayload = sqlite3PagerGetData(pDbPage);
  3290           nextPage = get4byte(aPayload);
  3291           if( a + offset > ovflSize ){
  3292             a = ovflSize - offset;
  3293           }
  3294           rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
  3295           sqlite3PagerUnref(pDbPage);
  3296           offset = 0;
  3297           amt -= a;
  3298           pBuf += a;
  3299         }
  3300       }
  3301     }
  3302   }
  3303 
  3304   if( rc==SQLITE_OK && amt>0 ){
  3305     return SQLITE_CORRUPT_BKPT;
  3306   }
  3307   return rc;
  3308 }
  3309 
  3310 /*
  3311 ** Read part of the key associated with cursor pCur.  Exactly
  3312 ** "amt" bytes will be transfered into pBuf[].  The transfer
  3313 ** begins at "offset".
  3314 **
  3315 ** Return SQLITE_OK on success or an error code if anything goes
  3316 ** wrong.  An error is returned if "offset+amt" is larger than
  3317 ** the available payload.
  3318 */
  3319 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3320   int rc;
  3321 
  3322   assert( cursorHoldsMutex(pCur) );
  3323   rc = restoreCursorPosition(pCur);
  3324   if( rc==SQLITE_OK ){
  3325     assert( pCur->eState==CURSOR_VALID );
  3326     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3327     if( pCur->apPage[0]->intKey ){
  3328       return SQLITE_CORRUPT_BKPT;
  3329     }
  3330     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3331     rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
  3332   }
  3333   return rc;
  3334 }
  3335 
  3336 /*
  3337 ** Read part of the data associated with cursor pCur.  Exactly
  3338 ** "amt" bytes will be transfered into pBuf[].  The transfer
  3339 ** begins at "offset".
  3340 **
  3341 ** Return SQLITE_OK on success or an error code if anything goes
  3342 ** wrong.  An error is returned if "offset+amt" is larger than
  3343 ** the available payload.
  3344 */
  3345 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  3346   int rc;
  3347 
  3348 #ifndef SQLITE_OMIT_INCRBLOB
  3349   if ( pCur->eState==CURSOR_INVALID ){
  3350     return SQLITE_ABORT;
  3351   }
  3352 #endif
  3353 
  3354   assert( cursorHoldsMutex(pCur) );
  3355   rc = restoreCursorPosition(pCur);
  3356   if( rc==SQLITE_OK ){
  3357     assert( pCur->eState==CURSOR_VALID );
  3358     assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
  3359     assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3360     rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
  3361   }
  3362   return rc;
  3363 }
  3364 
  3365 /*
  3366 ** Return a pointer to payload information from the entry that the 
  3367 ** pCur cursor is pointing to.  The pointer is to the beginning of
  3368 ** the key if skipKey==0 and it points to the beginning of data if
  3369 ** skipKey==1.  The number of bytes of available key/data is written
  3370 ** into *pAmt.  If *pAmt==0, then the value returned will not be
  3371 ** a valid pointer.
  3372 **
  3373 ** This routine is an optimization.  It is common for the entire key
  3374 ** and data to fit on the local page and for there to be no overflow
  3375 ** pages.  When that is so, this routine can be used to access the
  3376 ** key and data without making a copy.  If the key and/or data spills
  3377 ** onto overflow pages, then accessPayload() must be used to reassembly
  3378 ** the key/data and copy it into a preallocated buffer.
  3379 **
  3380 ** The pointer returned by this routine looks directly into the cached
  3381 ** page of the database.  The data might change or move the next time
  3382 ** any btree routine is called.
  3383 */
  3384 static const unsigned char *fetchPayload(
  3385   BtCursor *pCur,      /* Cursor pointing to entry to read from */
  3386   int *pAmt,           /* Write the number of available bytes here */
  3387   int skipKey          /* read beginning at data if this is true */
  3388 ){
  3389   unsigned char *aPayload;
  3390   MemPage *pPage;
  3391   u32 nKey;
  3392   int nLocal;
  3393 
  3394   assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
  3395   assert( pCur->eState==CURSOR_VALID );
  3396   assert( cursorHoldsMutex(pCur) );
  3397   pPage = pCur->apPage[pCur->iPage];
  3398   assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3399   getCellInfo(pCur);
  3400   aPayload = pCur->info.pCell;
  3401   aPayload += pCur->info.nHeader;
  3402   if( pPage->intKey ){
  3403     nKey = 0;
  3404   }else{
  3405     nKey = pCur->info.nKey;
  3406   }
  3407   if( skipKey ){
  3408     aPayload += nKey;
  3409     nLocal = pCur->info.nLocal - nKey;
  3410   }else{
  3411     nLocal = pCur->info.nLocal;
  3412     if( nLocal>nKey ){
  3413       nLocal = nKey;
  3414     }
  3415   }
  3416   *pAmt = nLocal;
  3417   return aPayload;
  3418 }
  3419 
  3420 
  3421 /*
  3422 ** For the entry that cursor pCur is point to, return as
  3423 ** many bytes of the key or data as are available on the local
  3424 ** b-tree page.  Write the number of available bytes into *pAmt.
  3425 **
  3426 ** The pointer returned is ephemeral.  The key/data may move
  3427 ** or be destroyed on the next call to any Btree routine,
  3428 ** including calls from other threads against the same cache.
  3429 ** Hence, a mutex on the BtShared should be held prior to calling
  3430 ** this routine.
  3431 **
  3432 ** These routines is used to get quick access to key and data
  3433 ** in the common case where no overflow pages are used.
  3434 */
  3435 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
  3436   assert( cursorHoldsMutex(pCur) );
  3437   if( pCur->eState==CURSOR_VALID ){
  3438     return (const void*)fetchPayload(pCur, pAmt, 0);
  3439   }
  3440   return 0;
  3441 }
  3442 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
  3443   assert( cursorHoldsMutex(pCur) );
  3444   if( pCur->eState==CURSOR_VALID ){
  3445     return (const void*)fetchPayload(pCur, pAmt, 1);
  3446   }
  3447   return 0;
  3448 }
  3449 
  3450 
  3451 /*
  3452 ** Move the cursor down to a new child page.  The newPgno argument is the
  3453 ** page number of the child page to move to.
  3454 */
  3455 static int moveToChild(BtCursor *pCur, u32 newPgno){
  3456   int rc;
  3457   int i = pCur->iPage;
  3458   MemPage *pNewPage;
  3459   BtShared *pBt = pCur->pBt;
  3460 
  3461   assert( cursorHoldsMutex(pCur) );
  3462   assert( pCur->eState==CURSOR_VALID );
  3463   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
  3464   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
  3465     return SQLITE_CORRUPT_BKPT;
  3466   }
  3467   rc = getAndInitPage(pBt, newPgno, &pNewPage);
  3468   if( rc ) return rc;
  3469   pCur->apPage[i+1] = pNewPage;
  3470   pCur->aiIdx[i+1] = 0;
  3471   pCur->iPage++;
  3472 
  3473   pCur->info.nSize = 0;
  3474   pCur->validNKey = 0;
  3475   if( pNewPage->nCell<1 ){
  3476     return SQLITE_CORRUPT_BKPT;
  3477   }
  3478   return SQLITE_OK;
  3479 }
  3480 
  3481 #ifndef NDEBUG
  3482 /*
  3483 ** Page pParent is an internal (non-leaf) tree page. This function 
  3484 ** asserts that page number iChild is the left-child if the iIdx'th
  3485 ** cell in page pParent. Or, if iIdx is equal to the total number of
  3486 ** cells in pParent, that page number iChild is the right-child of
  3487 ** the page.
  3488 */
  3489 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  3490   assert( iIdx<=pParent->nCell );
  3491   if( iIdx==pParent->nCell ){
  3492     assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  3493   }else{
  3494     assert( get4byte(findCell(pParent, iIdx))==iChild );
  3495   }
  3496 }
  3497 #else
  3498 #  define assertParentIndex(x,y,z) 
  3499 #endif
  3500 
  3501 /*
  3502 ** Move the cursor up to the parent page.
  3503 **
  3504 ** pCur->idx is set to the cell index that contains the pointer
  3505 ** to the page we are coming from.  If we are coming from the
  3506 ** right-most child page then pCur->idx is set to one more than
  3507 ** the largest cell index.
  3508 */
  3509 void sqlite3BtreeMoveToParent(BtCursor *pCur){
  3510   assert( cursorHoldsMutex(pCur) );
  3511   assert( pCur->eState==CURSOR_VALID );
  3512   assert( pCur->iPage>0 );
  3513   assert( pCur->apPage[pCur->iPage] );
  3514   assertParentIndex(
  3515     pCur->apPage[pCur->iPage-1], 
  3516     pCur->aiIdx[pCur->iPage-1], 
  3517     pCur->apPage[pCur->iPage]->pgno
  3518   );
  3519   releasePage(pCur->apPage[pCur->iPage]);
  3520   pCur->iPage--;
  3521   pCur->info.nSize = 0;
  3522   pCur->validNKey = 0;
  3523 }
  3524 
  3525 /*
  3526 ** Move the cursor to the root page
  3527 */
  3528 static int moveToRoot(BtCursor *pCur){
  3529   MemPage *pRoot;
  3530   int rc = SQLITE_OK;
  3531   Btree *p = pCur->pBtree;
  3532   BtShared *pBt = p->pBt;
  3533 
  3534   assert( cursorHoldsMutex(pCur) );
  3535   assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  3536   assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  3537   assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  3538   if( pCur->eState>=CURSOR_REQUIRESEEK ){
  3539     if( pCur->eState==CURSOR_FAULT ){
  3540       return pCur->skip;
  3541     }
  3542     sqlite3BtreeClearCursor(pCur);
  3543   }
  3544 
  3545   if( pCur->iPage>=0 ){
  3546     int i;
  3547     for(i=1; i<=pCur->iPage; i++){
  3548       releasePage(pCur->apPage[i]);
  3549     }
  3550   }else{
  3551     if( 
  3552       SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
  3553     ){
  3554       pCur->eState = CURSOR_INVALID;
  3555       return rc;
  3556     }
  3557   }
  3558 
  3559   pRoot = pCur->apPage[0];
  3560   assert( pRoot->pgno==pCur->pgnoRoot );
  3561   pCur->iPage = 0;
  3562   pCur->aiIdx[0] = 0;
  3563   pCur->info.nSize = 0;
  3564   pCur->atLast = 0;
  3565   pCur->validNKey = 0;
  3566 
  3567   if( pRoot->nCell==0 && !pRoot->leaf ){
  3568     Pgno subpage;
  3569     assert( pRoot->pgno==1 );
  3570     subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
  3571     assert( subpage>0 );
  3572     pCur->eState = CURSOR_VALID;
  3573     rc = moveToChild(pCur, subpage);
  3574   }else{
  3575     pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
  3576   }
  3577   return rc;
  3578 }
  3579 
  3580 /*
  3581 ** Move the cursor down to the left-most leaf entry beneath the
  3582 ** entry to which it is currently pointing.
  3583 **
  3584 ** The left-most leaf is the one with the smallest key - the first
  3585 ** in ascending order.
  3586 */
  3587 static int moveToLeftmost(BtCursor *pCur){
  3588   Pgno pgno;
  3589   int rc = SQLITE_OK;
  3590   MemPage *pPage;
  3591 
  3592   assert( cursorHoldsMutex(pCur) );
  3593   assert( pCur->eState==CURSOR_VALID );
  3594   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  3595     assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  3596     pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
  3597     rc = moveToChild(pCur, pgno);
  3598   }
  3599   return rc;
  3600 }
  3601 
  3602 /*
  3603 ** Move the cursor down to the right-most leaf entry beneath the
  3604 ** page to which it is currently pointing.  Notice the difference
  3605 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
  3606 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
  3607 ** finds the right-most entry beneath the *page*.
  3608 **
  3609 ** The right-most entry is the one with the largest key - the last
  3610 ** key in ascending order.
  3611 */
  3612 static int moveToRightmost(BtCursor *pCur){
  3613   Pgno pgno;
  3614   int rc = SQLITE_OK;
  3615   MemPage *pPage;
  3616 
  3617   assert( cursorHoldsMutex(pCur) );
  3618   assert( pCur->eState==CURSOR_VALID );
  3619   while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
  3620     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  3621     pCur->aiIdx[pCur->iPage] = pPage->nCell;
  3622     rc = moveToChild(pCur, pgno);
  3623   }
  3624   if( rc==SQLITE_OK ){
  3625     pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
  3626     pCur->info.nSize = 0;
  3627     pCur->validNKey = 0;
  3628   }
  3629   return rc;
  3630 }
  3631 
  3632 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
  3633 ** on success.  Set *pRes to 0 if the cursor actually points to something
  3634 ** or set *pRes to 1 if the table is empty.
  3635 */
  3636 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
  3637   int rc;
  3638 
  3639   assert( cursorHoldsMutex(pCur) );
  3640   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3641   rc = moveToRoot(pCur);
  3642   if( rc==SQLITE_OK ){
  3643     if( pCur->eState==CURSOR_INVALID ){
  3644       assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3645       *pRes = 1;
  3646       rc = SQLITE_OK;
  3647     }else{
  3648       assert( pCur->apPage[pCur->iPage]->nCell>0 );
  3649       *pRes = 0;
  3650       rc = moveToLeftmost(pCur);
  3651     }
  3652   }
  3653   return rc;
  3654 }
  3655 
  3656 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
  3657 ** on success.  Set *pRes to 0 if the cursor actually points to something
  3658 ** or set *pRes to 1 if the table is empty.
  3659 */
  3660 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
  3661   int rc;
  3662  
  3663   assert( cursorHoldsMutex(pCur) );
  3664   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3665   rc = moveToRoot(pCur);
  3666   if( rc==SQLITE_OK ){
  3667     if( CURSOR_INVALID==pCur->eState ){
  3668       assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3669       *pRes = 1;
  3670     }else{
  3671       assert( pCur->eState==CURSOR_VALID );
  3672       *pRes = 0;
  3673       rc = moveToRightmost(pCur);
  3674       getCellInfo(pCur);
  3675       pCur->atLast = rc==SQLITE_OK;
  3676     }
  3677   }
  3678   return rc;
  3679 }
  3680 
  3681 /* Move the cursor so that it points to an entry near the key 
  3682 ** specified by pIdxKey or intKey.   Return a success code.
  3683 **
  3684 ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
  3685 ** must be NULL.  For index tables, pIdxKey is used and intKey
  3686 ** is ignored.
  3687 **
  3688 ** If an exact match is not found, then the cursor is always
  3689 ** left pointing at a leaf page which would hold the entry if it
  3690 ** were present.  The cursor might point to an entry that comes
  3691 ** before or after the key.
  3692 **
  3693 ** The result of comparing the key with the entry to which the
  3694 ** cursor is written to *pRes if pRes!=NULL.  The meaning of
  3695 ** this value is as follows:
  3696 **
  3697 **     *pRes<0      The cursor is left pointing at an entry that
  3698 **                  is smaller than pKey or if the table is empty
  3699 **                  and the cursor is therefore left point to nothing.
  3700 **
  3701 **     *pRes==0     The cursor is left pointing at an entry that
  3702 **                  exactly matches pKey.
  3703 **
  3704 **     *pRes>0      The cursor is left pointing at an entry that
  3705 **                  is larger than pKey.
  3706 **
  3707 */
  3708 int sqlite3BtreeMovetoUnpacked(
  3709   BtCursor *pCur,          /* The cursor to be moved */
  3710   UnpackedRecord *pIdxKey, /* Unpacked index key */
  3711   i64 intKey,              /* The table key */
  3712   int biasRight,           /* If true, bias the search to the high end */
  3713   int *pRes                /* Write search results here */
  3714 ){
  3715   int rc;
  3716 
  3717   assert( cursorHoldsMutex(pCur) );
  3718   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3719 
  3720   /* If the cursor is already positioned at the point we are trying
  3721   ** to move to, then just return without doing any work */
  3722   if( pCur->eState==CURSOR_VALID && pCur->validNKey 
  3723    && pCur->apPage[0]->intKey 
  3724   ){
  3725     if( pCur->info.nKey==intKey ){
  3726       *pRes = 0;
  3727       return SQLITE_OK;
  3728     }
  3729     if( pCur->atLast && pCur->info.nKey<intKey ){
  3730       *pRes = -1;
  3731       return SQLITE_OK;
  3732     }
  3733   }
  3734 
  3735   rc = moveToRoot(pCur);
  3736   if( rc ){
  3737     return rc;
  3738   }
  3739   assert( pCur->apPage[pCur->iPage] );
  3740   assert( pCur->apPage[pCur->iPage]->isInit );
  3741   if( pCur->eState==CURSOR_INVALID ){
  3742     *pRes = -1;
  3743     assert( pCur->apPage[pCur->iPage]->nCell==0 );
  3744     return SQLITE_OK;
  3745   }
  3746   assert( pCur->apPage[0]->intKey || pIdxKey );
  3747   for(;;){
  3748     int lwr, upr;
  3749     Pgno chldPg;
  3750     MemPage *pPage = pCur->apPage[pCur->iPage];
  3751     int c = -1;  /* pRes return if table is empty must be -1 */
  3752     lwr = 0;
  3753     upr = pPage->nCell-1;
  3754     if( !pPage->intKey && pIdxKey==0 ){
  3755       rc = SQLITE_CORRUPT_BKPT;
  3756       goto moveto_finish;
  3757     }
  3758     if( biasRight ){
  3759       pCur->aiIdx[pCur->iPage] = upr;
  3760     }else{
  3761       pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
  3762     }
  3763     if( lwr<=upr ) for(;;){
  3764       void *pCellKey;
  3765       i64 nCellKey;
  3766       int idx = pCur->aiIdx[pCur->iPage];
  3767       pCur->info.nSize = 0;
  3768       pCur->validNKey = 1;
  3769       if( pPage->intKey ){
  3770         u8 *pCell;
  3771         pCell = findCell(pPage, idx) + pPage->childPtrSize;
  3772         if( pPage->hasData ){
  3773           u32 dummy;
  3774           pCell += getVarint32(pCell, dummy);
  3775         }
  3776         getVarint(pCell, (u64*)&nCellKey);
  3777         if( nCellKey==intKey ){
  3778           c = 0;
  3779         }else if( nCellKey<intKey ){
  3780           c = -1;
  3781         }else{
  3782           assert( nCellKey>intKey );
  3783           c = +1;
  3784         }
  3785       }else{
  3786         int available;
  3787         pCellKey = (void *)fetchPayload(pCur, &available, 0);
  3788         nCellKey = pCur->info.nKey;
  3789         if( available>=nCellKey ){
  3790           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
  3791         }else{
  3792           pCellKey = sqlite3Malloc( nCellKey );
  3793           if( pCellKey==0 ){
  3794             rc = SQLITE_NOMEM;
  3795             goto moveto_finish;
  3796           }
  3797           rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
  3798           c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
  3799           sqlite3_free(pCellKey);
  3800           if( rc ) goto moveto_finish;
  3801         }
  3802       }
  3803       if( c==0 ){
  3804         pCur->info.nKey = nCellKey;
  3805         if( pPage->intKey && !pPage->leaf ){
  3806           lwr = idx;
  3807           upr = lwr - 1;
  3808           break;
  3809         }else{
  3810           if( pRes ) *pRes = 0;
  3811           rc = SQLITE_OK;
  3812           goto moveto_finish;
  3813         }
  3814       }
  3815       if( c<0 ){
  3816         lwr = idx+1;
  3817       }else{
  3818         upr = idx-1;
  3819       }
  3820       if( lwr>upr ){
  3821         pCur->info.nKey = nCellKey;
  3822         break;
  3823       }
  3824       pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
  3825     }
  3826     assert( lwr==upr+1 );
  3827     assert( pPage->isInit );
  3828     if( pPage->leaf ){
  3829       chldPg = 0;
  3830     }else if( lwr>=pPage->nCell ){
  3831       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  3832     }else{
  3833       chldPg = get4byte(findCell(pPage, lwr));
  3834     }
  3835     if( chldPg==0 ){
  3836       assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
  3837       if( pRes ) *pRes = c;
  3838       rc = SQLITE_OK;
  3839       goto moveto_finish;
  3840     }
  3841     pCur->aiIdx[pCur->iPage] = lwr;
  3842     pCur->info.nSize = 0;
  3843     pCur->validNKey = 0;
  3844     rc = moveToChild(pCur, chldPg);
  3845     if( rc ) goto moveto_finish;
  3846   }
  3847 moveto_finish:
  3848   return rc;
  3849 }
  3850 
  3851 /*
  3852 ** In this version of BtreeMoveto, pKey is a packed index record
  3853 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
  3854 ** record and then call BtreeMovetoUnpacked() to do the work.
  3855 */
  3856 int sqlite3BtreeMoveto(
  3857   BtCursor *pCur,     /* Cursor open on the btree to be searched */
  3858   const void *pKey,   /* Packed key if the btree is an index */
  3859   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
  3860   int bias,           /* Bias search to the high end */
  3861   int *pRes           /* Write search results here */
  3862 ){
  3863   int rc;                    /* Status code */
  3864   UnpackedRecord *pIdxKey;   /* Unpacked index key */
  3865   UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
  3866 
  3867   if( pKey ){
  3868     pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
  3869                                       aSpace, sizeof(aSpace));
  3870     if( pIdxKey==0 ) return SQLITE_NOMEM;
  3871   }else{
  3872     pIdxKey = 0;
  3873   }
  3874   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
  3875   if( pKey ){
  3876     sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
  3877   }
  3878   return rc;
  3879 }
  3880 
  3881 
  3882 /*
  3883 ** Return TRUE if the cursor is not pointing at an entry of the table.
  3884 **
  3885 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
  3886 ** past the last entry in the table or sqlite3BtreePrev() moves past
  3887 ** the first entry.  TRUE is also returned if the table is empty.
  3888 */
  3889 int sqlite3BtreeEof(BtCursor *pCur){
  3890   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
  3891   ** have been deleted? This API will need to change to return an error code
  3892   ** as well as the boolean result value.
  3893   */
  3894   return (CURSOR_VALID!=pCur->eState);
  3895 }
  3896 
  3897 /*
  3898 ** Return the database connection handle for a cursor.
  3899 */
  3900 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
  3901   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  3902   return pCur->pBtree->db;
  3903 }
  3904 
  3905 /*
  3906 ** Advance the cursor to the next entry in the database.  If
  3907 ** successful then set *pRes=0.  If the cursor
  3908 ** was already pointing to the last entry in the database before
  3909 ** this routine was called, then set *pRes=1.
  3910 */
  3911 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
  3912   int rc;
  3913   int idx;
  3914   MemPage *pPage;
  3915 
  3916   assert( cursorHoldsMutex(pCur) );
  3917   rc = restoreCursorPosition(pCur);
  3918   if( rc!=SQLITE_OK ){
  3919     return rc;
  3920   }
  3921   assert( pRes!=0 );
  3922   if( CURSOR_INVALID==pCur->eState ){
  3923     *pRes = 1;
  3924     return SQLITE_OK;
  3925   }
  3926   if( pCur->skip>0 ){
  3927     pCur->skip = 0;
  3928     *pRes = 0;
  3929     return SQLITE_OK;
  3930   }
  3931   pCur->skip = 0;
  3932 
  3933   pPage = pCur->apPage[pCur->iPage];
  3934   idx = ++pCur->aiIdx[pCur->iPage];
  3935   assert( pPage->isInit );
  3936   assert( idx<=pPage->nCell );
  3937 
  3938   pCur->info.nSize = 0;
  3939   pCur->validNKey = 0;
  3940   if( idx>=pPage->nCell ){
  3941     if( !pPage->leaf ){
  3942       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
  3943       if( rc ) return rc;
  3944       rc = moveToLeftmost(pCur);
  3945       *pRes = 0;
  3946       return rc;
  3947     }
  3948     do{
  3949       if( pCur->iPage==0 ){
  3950         *pRes = 1;
  3951         pCur->eState = CURSOR_INVALID;
  3952         return SQLITE_OK;
  3953       }
  3954       sqlite3BtreeMoveToParent(pCur);
  3955       pPage = pCur->apPage[pCur->iPage];
  3956     }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
  3957     *pRes = 0;
  3958     if( pPage->intKey ){
  3959       rc = sqlite3BtreeNext(pCur, pRes);
  3960     }else{
  3961       rc = SQLITE_OK;
  3962     }
  3963     return rc;
  3964   }
  3965   *pRes = 0;
  3966   if( pPage->leaf ){
  3967     return SQLITE_OK;
  3968   }
  3969   rc = moveToLeftmost(pCur);
  3970   return rc;
  3971 }
  3972 
  3973 
  3974 /*
  3975 ** Step the cursor to the back to the previous entry in the database.  If
  3976 ** successful then set *pRes=0.  If the cursor
  3977 ** was already pointing to the first entry in the database before
  3978 ** this routine was called, then set *pRes=1.
  3979 */
  3980 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
  3981   int rc;
  3982   MemPage *pPage;
  3983 
  3984   assert( cursorHoldsMutex(pCur) );
  3985   rc = restoreCursorPosition(pCur);
  3986   if( rc!=SQLITE_OK ){
  3987     return rc;
  3988   }
  3989   pCur->atLast = 0;
  3990   if( CURSOR_INVALID==pCur->eState ){
  3991     *pRes = 1;
  3992     return SQLITE_OK;
  3993   }
  3994   if( pCur->skip<0 ){
  3995     pCur->skip = 0;
  3996     *pRes = 0;
  3997     return SQLITE_OK;
  3998   }
  3999   pCur->skip = 0;
  4000 
  4001   pPage = pCur->apPage[pCur->iPage];
  4002   assert( pPage->isInit );
  4003   if( !pPage->leaf ){
  4004     int idx = pCur->aiIdx[pCur->iPage];
  4005     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
  4006     if( rc ){
  4007       return rc;
  4008     }
  4009     rc = moveToRightmost(pCur);
  4010   }else{
  4011     while( pCur->aiIdx[pCur->iPage]==0 ){
  4012       if( pCur->iPage==0 ){
  4013         pCur->eState = CURSOR_INVALID;
  4014         *pRes = 1;
  4015         return SQLITE_OK;
  4016       }
  4017       sqlite3BtreeMoveToParent(pCur);
  4018     }
  4019     pCur->info.nSize = 0;
  4020     pCur->validNKey = 0;
  4021 
  4022     pCur->aiIdx[pCur->iPage]--;
  4023     pPage = pCur->apPage[pCur->iPage];
  4024     if( pPage->intKey && !pPage->leaf ){
  4025       rc = sqlite3BtreePrevious(pCur, pRes);
  4026     }else{
  4027       rc = SQLITE_OK;
  4028     }
  4029   }
  4030   *pRes = 0;
  4031   return rc;
  4032 }
  4033 
  4034 /*
  4035 ** Allocate a new page from the database file.
  4036 **
  4037 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
  4038 ** has already been called on the new page.)  The new page has also
  4039 ** been referenced and the calling routine is responsible for calling
  4040 ** sqlite3PagerUnref() on the new page when it is done.
  4041 **
  4042 ** SQLITE_OK is returned on success.  Any other return value indicates
  4043 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
  4044 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
  4045 **
  4046 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 
  4047 ** locate a page close to the page number "nearby".  This can be used in an
  4048 ** attempt to keep related pages close to each other in the database file,
  4049 ** which in turn can make database access faster.
  4050 **
  4051 ** If the "exact" parameter is not 0, and the page-number nearby exists 
  4052 ** anywhere on the free-list, then it is guarenteed to be returned. This
  4053 ** is only used by auto-vacuum databases when allocating a new table.
  4054 */
  4055 static int allocateBtreePage(
  4056   BtShared *pBt, 
  4057   MemPage **ppPage, 
  4058   Pgno *pPgno, 
  4059   Pgno nearby,
  4060   u8 exact
  4061 ){
  4062   MemPage *pPage1;
  4063   int rc;
  4064   int n;     /* Number of pages on the freelist */
  4065   int k;     /* Number of leaves on the trunk of the freelist */
  4066   MemPage *pTrunk = 0;
  4067   MemPage *pPrevTrunk = 0;
  4068 
  4069   assert( sqlite3_mutex_held(pBt->mutex) );
  4070   pPage1 = pBt->pPage1;
  4071   n = get4byte(&pPage1->aData[36]);
  4072   if( n>0 ){
  4073     /* There are pages on the freelist.  Reuse one of those pages. */
  4074     Pgno iTrunk;
  4075     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
  4076     
  4077     /* If the 'exact' parameter was true and a query of the pointer-map
  4078     ** shows that the page 'nearby' is somewhere on the free-list, then
  4079     ** the entire-list will be searched for that page.
  4080     */
  4081 #ifndef SQLITE_OMIT_AUTOVACUUM
  4082     if( exact && nearby<=pagerPagecount(pBt->pPager) ){
  4083       u8 eType;
  4084       assert( nearby>0 );
  4085       assert( pBt->autoVacuum );
  4086       rc = ptrmapGet(pBt, nearby, &eType, 0);
  4087       if( rc ) return rc;
  4088       if( eType==PTRMAP_FREEPAGE ){
  4089         searchList = 1;
  4090       }
  4091       *pPgno = nearby;
  4092     }
  4093 #endif
  4094 
  4095     /* Decrement the free-list count by 1. Set iTrunk to the index of the
  4096     ** first free-list trunk page. iPrevTrunk is initially 1.
  4097     */
  4098     rc = sqlite3PagerWrite(pPage1->pDbPage);
  4099     if( rc ) return rc;
  4100     put4byte(&pPage1->aData[36], n-1);
  4101 
  4102     /* The code within this loop is run only once if the 'searchList' variable
  4103     ** is not true. Otherwise, it runs once for each trunk-page on the
  4104     ** free-list until the page 'nearby' is located.
  4105     */
  4106     do {
  4107       pPrevTrunk = pTrunk;
  4108       if( pPrevTrunk ){
  4109         iTrunk = get4byte(&pPrevTrunk->aData[0]);
  4110       }else{
  4111         iTrunk = get4byte(&pPage1->aData[32]);
  4112       }
  4113       rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
  4114       if( rc ){
  4115         pTrunk = 0;
  4116         goto end_allocate_page;
  4117       }
  4118 
  4119       k = get4byte(&pTrunk->aData[4]);
  4120       if( k==0 && !searchList ){
  4121         /* The trunk has no leaves and the list is not being searched. 
  4122         ** So extract the trunk page itself and use it as the newly 
  4123         ** allocated page */
  4124         assert( pPrevTrunk==0 );
  4125         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4126         if( rc ){
  4127           goto end_allocate_page;
  4128         }
  4129         *pPgno = iTrunk;
  4130         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4131         *ppPage = pTrunk;
  4132         pTrunk = 0;
  4133         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4134       }else if( k>pBt->usableSize/4 - 2 ){
  4135         /* Value of k is out of range.  Database corruption */
  4136         rc = SQLITE_CORRUPT_BKPT;
  4137         goto end_allocate_page;
  4138 #ifndef SQLITE_OMIT_AUTOVACUUM
  4139       }else if( searchList && nearby==iTrunk ){
  4140         /* The list is being searched and this trunk page is the page
  4141         ** to allocate, regardless of whether it has leaves.
  4142         */
  4143         assert( *pPgno==iTrunk );
  4144         *ppPage = pTrunk;
  4145         searchList = 0;
  4146         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4147         if( rc ){
  4148           goto end_allocate_page;
  4149         }
  4150         if( k==0 ){
  4151           if( !pPrevTrunk ){
  4152             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
  4153           }else{
  4154             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
  4155           }
  4156         }else{
  4157           /* The trunk page is required by the caller but it contains 
  4158           ** pointers to free-list leaves. The first leaf becomes a trunk
  4159           ** page in this case.
  4160           */
  4161           MemPage *pNewTrunk;
  4162           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
  4163           rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
  4164           if( rc!=SQLITE_OK ){
  4165             goto end_allocate_page;
  4166           }
  4167           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
  4168           if( rc!=SQLITE_OK ){
  4169             releasePage(pNewTrunk);
  4170             goto end_allocate_page;
  4171           }
  4172           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
  4173           put4byte(&pNewTrunk->aData[4], k-1);
  4174           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
  4175           releasePage(pNewTrunk);
  4176           if( !pPrevTrunk ){
  4177             put4byte(&pPage1->aData[32], iNewTrunk);
  4178           }else{
  4179             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
  4180             if( rc ){
  4181               goto end_allocate_page;
  4182             }
  4183             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
  4184           }
  4185         }
  4186         pTrunk = 0;
  4187         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
  4188 #endif
  4189       }else{
  4190         /* Extract a leaf from the trunk */
  4191         int closest;
  4192         Pgno iPage;
  4193         unsigned char *aData = pTrunk->aData;
  4194         rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4195         if( rc ){
  4196           goto end_allocate_page;
  4197         }
  4198         if( nearby>0 ){
  4199           int i, dist;
  4200           closest = 0;
  4201           dist = get4byte(&aData[8]) - nearby;
  4202           if( dist<0 ) dist = -dist;
  4203           for(i=1; i<k; i++){
  4204             int d2 = get4byte(&aData[8+i*4]) - nearby;
  4205             if( d2<0 ) d2 = -d2;
  4206             if( d2<dist ){
  4207               closest = i;
  4208               dist = d2;
  4209             }
  4210           }
  4211         }else{
  4212           closest = 0;
  4213         }
  4214 
  4215         iPage = get4byte(&aData[8+closest*4]);
  4216         if( !searchList || iPage==nearby ){
  4217           int nPage;
  4218           *pPgno = iPage;
  4219           nPage = pagerPagecount(pBt->pPager);
  4220           if( *pPgno>nPage ){
  4221             /* Free page off the end of the file */
  4222             rc = SQLITE_CORRUPT_BKPT;
  4223             goto end_allocate_page;
  4224           }
  4225           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
  4226                  ": %d more free pages\n",
  4227                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
  4228           if( closest<k-1 ){
  4229             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
  4230           }
  4231           put4byte(&aData[4], k-1);
  4232           rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
  4233           if( rc==SQLITE_OK ){
  4234             sqlite3PagerDontRollback((*ppPage)->pDbPage);
  4235             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4236             if( rc!=SQLITE_OK ){
  4237               releasePage(*ppPage);
  4238             }
  4239           }
  4240           searchList = 0;
  4241         }
  4242       }
  4243       releasePage(pPrevTrunk);
  4244       pPrevTrunk = 0;
  4245     }while( searchList );
  4246   }else{
  4247     /* There are no pages on the freelist, so create a new page at the
  4248     ** end of the file */
  4249     int nPage = pagerPagecount(pBt->pPager);
  4250     *pPgno = nPage + 1;
  4251 
  4252 #ifndef SQLITE_OMIT_AUTOVACUUM
  4253     if( pBt->nTrunc ){
  4254       /* An incr-vacuum has already run within this transaction. So the
  4255       ** page to allocate is not from the physical end of the file, but
  4256       ** at pBt->nTrunc. 
  4257       */
  4258       *pPgno = pBt->nTrunc+1;
  4259       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
  4260         (*pPgno)++;
  4261       }
  4262     }
  4263     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
  4264       /* If *pPgno refers to a pointer-map page, allocate two new pages
  4265       ** at the end of the file instead of one. The first allocated page
  4266       ** becomes a new pointer-map page, the second is used by the caller.
  4267       */
  4268       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
  4269       assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4270       (*pPgno)++;
  4271       if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
  4272     }
  4273     if( pBt->nTrunc ){
  4274       pBt->nTrunc = *pPgno;
  4275     }
  4276 #endif
  4277 
  4278     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4279     rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
  4280     if( rc ) return rc;
  4281     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
  4282     if( rc!=SQLITE_OK ){
  4283       releasePage(*ppPage);
  4284     }
  4285     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
  4286   }
  4287 
  4288   assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
  4289 
  4290 end_allocate_page:
  4291   releasePage(pTrunk);
  4292   releasePage(pPrevTrunk);
  4293   if( rc==SQLITE_OK ){
  4294     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
  4295       releasePage(*ppPage);
  4296       return SQLITE_CORRUPT_BKPT;
  4297     }
  4298     (*ppPage)->isInit = 0;
  4299   }
  4300   return rc;
  4301 }
  4302 
  4303 /*
  4304 ** Add a page of the database file to the freelist.
  4305 **
  4306 ** sqlite3PagerUnref() is NOT called for pPage.
  4307 */
  4308 static int freePage(MemPage *pPage){
  4309   BtShared *pBt = pPage->pBt;
  4310   MemPage *pPage1 = pBt->pPage1;
  4311   int rc, n, k;
  4312 
  4313   /* Prepare the page for freeing */
  4314   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4315   assert( pPage->pgno>1 );
  4316   pPage->isInit = 0;
  4317 
  4318   /* Increment the free page count on pPage1 */
  4319   rc = sqlite3PagerWrite(pPage1->pDbPage);
  4320   if( rc ) return rc;
  4321   n = get4byte(&pPage1->aData[36]);
  4322   put4byte(&pPage1->aData[36], n+1);
  4323 
  4324 #ifdef SQLITE_SECURE_DELETE
  4325   /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
  4326   ** always fully overwrite deleted information with zeros.
  4327   */
  4328   rc = sqlite3PagerWrite(pPage->pDbPage);
  4329   if( rc ) return rc;
  4330   memset(pPage->aData, 0, pPage->pBt->pageSize);
  4331 #endif
  4332 
  4333   /* If the database supports auto-vacuum, write an entry in the pointer-map
  4334   ** to indicate that the page is free.
  4335   */
  4336   if( ISAUTOVACUUM ){
  4337     rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
  4338     if( rc ) return rc;
  4339   }
  4340 
  4341   if( n==0 ){
  4342     /* This is the first free page */
  4343     rc = sqlite3PagerWrite(pPage->pDbPage);
  4344     if( rc ) return rc;
  4345     memset(pPage->aData, 0, 8);
  4346     put4byte(&pPage1->aData[32], pPage->pgno);
  4347     TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
  4348   }else{
  4349     /* Other free pages already exist.  Retrive the first trunk page
  4350     ** of the freelist and find out how many leaves it has. */
  4351     MemPage *pTrunk;
  4352     rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
  4353     if( rc ) return rc;
  4354     k = get4byte(&pTrunk->aData[4]);
  4355     if( k>=pBt->usableSize/4 - 8 ){
  4356       /* The trunk is full.  Turn the page being freed into a new
  4357       ** trunk page with no leaves.
  4358       **
  4359       ** Note that the trunk page is not really full until it contains
  4360       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
  4361       ** coded.  But due to a coding error in versions of SQLite prior to
  4362       ** 3.6.0, databases with freelist trunk pages holding more than
  4363       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
  4364       ** to maintain backwards compatibility with older versions of SQLite,
  4365       ** we will contain to restrict the number of entries to usableSize/4 - 8
  4366       ** for now.  At some point in the future (once everyone has upgraded
  4367       ** to 3.6.0 or later) we should consider fixing the conditional above
  4368       ** to read "usableSize/4-2" instead of "usableSize/4-8".
  4369       */
  4370       rc = sqlite3PagerWrite(pPage->pDbPage);
  4371       if( rc==SQLITE_OK ){
  4372         put4byte(pPage->aData, pTrunk->pgno);
  4373         put4byte(&pPage->aData[4], 0);
  4374         put4byte(&pPage1->aData[32], pPage->pgno);
  4375         TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
  4376                 pPage->pgno, pTrunk->pgno));
  4377       }
  4378     }else if( k<0 ){
  4379       rc = SQLITE_CORRUPT;
  4380     }else{
  4381       /* Add the newly freed page as a leaf on the current trunk */
  4382       rc = sqlite3PagerWrite(pTrunk->pDbPage);
  4383       if( rc==SQLITE_OK ){
  4384         put4byte(&pTrunk->aData[4], k+1);
  4385         put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
  4386 #ifndef SQLITE_SECURE_DELETE
  4387         rc = sqlite3PagerDontWrite(pPage->pDbPage);
  4388 #endif
  4389       }
  4390       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
  4391     }
  4392     releasePage(pTrunk);
  4393   }
  4394   return rc;
  4395 }
  4396 
  4397 /*
  4398 ** Free any overflow pages associated with the given Cell.
  4399 */
  4400 static int clearCell(MemPage *pPage, unsigned char *pCell){
  4401   BtShared *pBt = pPage->pBt;
  4402   CellInfo info;
  4403   Pgno ovflPgno;
  4404   int rc;
  4405   int nOvfl;
  4406   int ovflPageSize;
  4407 
  4408   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4409   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4410   if( info.iOverflow==0 ){
  4411     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
  4412   }
  4413   ovflPgno = get4byte(&pCell[info.iOverflow]);
  4414   ovflPageSize = pBt->usableSize - 4;
  4415   nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
  4416   assert( ovflPgno==0 || nOvfl>0 );
  4417   while( nOvfl-- ){
  4418     MemPage *pOvfl;
  4419     if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
  4420       return SQLITE_CORRUPT_BKPT;
  4421     }
  4422 
  4423     rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
  4424     if( rc ) return rc;
  4425     rc = freePage(pOvfl);
  4426     sqlite3PagerUnref(pOvfl->pDbPage);
  4427     if( rc ) return rc;
  4428   }
  4429   return SQLITE_OK;
  4430 }
  4431 
  4432 /*
  4433 ** Create the byte sequence used to represent a cell on page pPage
  4434 ** and write that byte sequence into pCell[].  Overflow pages are
  4435 ** allocated and filled in as necessary.  The calling procedure
  4436 ** is responsible for making sure sufficient space has been allocated
  4437 ** for pCell[].
  4438 **
  4439 ** Note that pCell does not necessary need to point to the pPage->aData
  4440 ** area.  pCell might point to some temporary storage.  The cell will
  4441 ** be constructed in this temporary area then copied into pPage->aData
  4442 ** later.
  4443 */
  4444 static int fillInCell(
  4445   MemPage *pPage,                /* The page that contains the cell */
  4446   unsigned char *pCell,          /* Complete text of the cell */
  4447   const void *pKey, i64 nKey,    /* The key */
  4448   const void *pData,int nData,   /* The data */
  4449   int nZero,                     /* Extra zero bytes to append to pData */
  4450   int *pnSize                    /* Write cell size here */
  4451 ){
  4452   int nPayload;
  4453   const u8 *pSrc;
  4454   int nSrc, n, rc;
  4455   int spaceLeft;
  4456   MemPage *pOvfl = 0;
  4457   MemPage *pToRelease = 0;
  4458   unsigned char *pPrior;
  4459   unsigned char *pPayload;
  4460   BtShared *pBt = pPage->pBt;
  4461   Pgno pgnoOvfl = 0;
  4462   int nHeader;
  4463   CellInfo info;
  4464 
  4465   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4466 
  4467   /* Fill in the header. */
  4468   nHeader = 0;
  4469   if( !pPage->leaf ){
  4470     nHeader += 4;
  4471   }
  4472   if( pPage->hasData ){
  4473     nHeader += putVarint(&pCell[nHeader], nData+nZero);
  4474   }else{
  4475     nData = nZero = 0;
  4476   }
  4477   nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
  4478   sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4479   assert( info.nHeader==nHeader );
  4480   assert( info.nKey==nKey );
  4481   assert( info.nData==nData+nZero );
  4482   
  4483   /* Fill in the payload */
  4484   nPayload = nData + nZero;
  4485   if( pPage->intKey ){
  4486     pSrc = pData;
  4487     nSrc = nData;
  4488     nData = 0;
  4489   }else{
  4490     nPayload += nKey;
  4491     pSrc = pKey;
  4492     nSrc = nKey;
  4493   }
  4494   *pnSize = info.nSize;
  4495   spaceLeft = info.nLocal;
  4496   pPayload = &pCell[nHeader];
  4497   pPrior = &pCell[info.iOverflow];
  4498 
  4499   while( nPayload>0 ){
  4500     if( spaceLeft==0 ){
  4501       int isExact = 0;
  4502 #ifndef SQLITE_OMIT_AUTOVACUUM
  4503       Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
  4504       if( pBt->autoVacuum ){
  4505         do{
  4506           pgnoOvfl++;
  4507         } while( 
  4508           PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
  4509         );
  4510         if( pgnoOvfl>1 ){
  4511           /* isExact = 1; */
  4512         }
  4513       }
  4514 #endif
  4515       rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
  4516 #ifndef SQLITE_OMIT_AUTOVACUUM
  4517       /* If the database supports auto-vacuum, and the second or subsequent
  4518       ** overflow page is being allocated, add an entry to the pointer-map
  4519       ** for that page now. 
  4520       **
  4521       ** If this is the first overflow page, then write a partial entry 
  4522       ** to the pointer-map. If we write nothing to this pointer-map slot,
  4523       ** then the optimistic overflow chain processing in clearCell()
  4524       ** may misinterpret the uninitialised values and delete the
  4525       ** wrong pages from the database.
  4526       */
  4527       if( pBt->autoVacuum && rc==SQLITE_OK ){
  4528         u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
  4529         rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
  4530         if( rc ){
  4531           releasePage(pOvfl);
  4532         }
  4533       }
  4534 #endif
  4535       if( rc ){
  4536         releasePage(pToRelease);
  4537         return rc;
  4538       }
  4539       put4byte(pPrior, pgnoOvfl);
  4540       releasePage(pToRelease);
  4541       pToRelease = pOvfl;
  4542       pPrior = pOvfl->aData;
  4543       put4byte(pPrior, 0);
  4544       pPayload = &pOvfl->aData[4];
  4545       spaceLeft = pBt->usableSize - 4;
  4546     }
  4547     n = nPayload;
  4548     if( n>spaceLeft ) n = spaceLeft;
  4549     if( nSrc>0 ){
  4550       if( n>nSrc ) n = nSrc;
  4551       assert( pSrc );
  4552       memcpy(pPayload, pSrc, n);
  4553     }else{
  4554       memset(pPayload, 0, n);
  4555     }
  4556     nPayload -= n;
  4557     pPayload += n;
  4558     pSrc += n;
  4559     nSrc -= n;
  4560     spaceLeft -= n;
  4561     if( nSrc==0 ){
  4562       nSrc = nData;
  4563       pSrc = pData;
  4564     }
  4565   }
  4566   releasePage(pToRelease);
  4567   return SQLITE_OK;
  4568 }
  4569 
  4570 /*
  4571 ** Remove the i-th cell from pPage.  This routine effects pPage only.
  4572 ** The cell content is not freed or deallocated.  It is assumed that
  4573 ** the cell content has been copied someplace else.  This routine just
  4574 ** removes the reference to the cell from pPage.
  4575 **
  4576 ** "sz" must be the number of bytes in the cell.
  4577 */
  4578 static int dropCell(MemPage *pPage, int idx, int sz){
  4579   int i;          /* Loop counter */
  4580   int pc;         /* Offset to cell content of cell being deleted */
  4581   u8 *data;       /* pPage->aData */
  4582   u8 *ptr;        /* Used to move bytes around within data[] */
  4583   int rc;         /* Return code */
  4584 
  4585   assert( idx>=0 && idx<pPage->nCell );
  4586   assert( sz==cellSize(pPage, idx) );
  4587   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  4588   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4589   data = pPage->aData;
  4590   ptr = &data[pPage->cellOffset + 2*idx];
  4591   pc = get2byte(ptr);
  4592   if( pc<pPage->hdrOffset+6+(pPage->leaf?0:4)
  4593    || pc+sz>pPage->pBt->usableSize
  4594   ){
  4595      return SQLITE_CORRUPT_BKPT;
  4596   }
  4597   rc = freeSpace(pPage, pc, sz);
  4598   if( rc ) return rc;
  4599   for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
  4600     ptr[0] = ptr[2];
  4601     ptr[1] = ptr[3];
  4602   }
  4603   pPage->nCell--;
  4604   put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
  4605   pPage->nFree += 2;
  4606   return SQLITE_OK;
  4607 }
  4608 
  4609 /*
  4610 ** Insert a new cell on pPage at cell index "i".  pCell points to the
  4611 ** content of the cell.
  4612 **
  4613 ** If the cell content will fit on the page, then put it there.  If it
  4614 ** will not fit, then make a copy of the cell content into pTemp if
  4615 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
  4616 ** in pPage->aOvfl[] and make it point to the cell content (either
  4617 ** in pTemp or the original pCell) and also record its index. 
  4618 ** Allocating a new entry in pPage->aCell[] implies that 
  4619 ** pPage->nOverflow is incremented.
  4620 **
  4621 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
  4622 ** cell. The caller will overwrite them after this function returns. If
  4623 ** nSkip is non-zero, then pCell may not point to an invalid memory location 
  4624 ** (but pCell+nSkip is always valid).
  4625 */
  4626 static int insertCell(
  4627   MemPage *pPage,   /* Page into which we are copying */
  4628   int i,            /* New cell becomes the i-th cell of the page */
  4629   u8 *pCell,        /* Content of the new cell */
  4630   int sz,           /* Bytes of content in pCell */
  4631   u8 *pTemp,        /* Temp storage space for pCell, if needed */
  4632   u8 nSkip          /* Do not write the first nSkip bytes of the cell */
  4633 ){
  4634   int idx;          /* Where to write new cell content in data[] */
  4635   int j;            /* Loop counter */
  4636   int top;          /* First byte of content for any cell in data[] */
  4637   int end;          /* First byte past the last cell pointer in data[] */
  4638   int ins;          /* Index in data[] where new cell pointer is inserted */
  4639   int hdr;          /* Offset into data[] of the page header */
  4640   int cellOffset;   /* Address of first cell pointer in data[] */
  4641   u8 *data;         /* The content of the whole page */
  4642   u8 *ptr;          /* Used for moving information around in data[] */
  4643 
  4644   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
  4645   assert( sz==cellSizePtr(pPage, pCell) );
  4646   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4647   if( pPage->nOverflow || sz+2>pPage->nFree ){
  4648     if( pTemp ){
  4649       memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
  4650       pCell = pTemp;
  4651     }
  4652     j = pPage->nOverflow++;
  4653     assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
  4654     pPage->aOvfl[j].pCell = pCell;
  4655     pPage->aOvfl[j].idx = i;
  4656     pPage->nFree = 0;
  4657   }else{
  4658     int rc = sqlite3PagerWrite(pPage->pDbPage);
  4659     if( rc!=SQLITE_OK ){
  4660       return rc;
  4661     }
  4662     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  4663     data = pPage->aData;
  4664     hdr = pPage->hdrOffset;
  4665     top = get2byte(&data[hdr+5]);
  4666     cellOffset = pPage->cellOffset;
  4667     end = cellOffset + 2*pPage->nCell + 2;
  4668     ins = cellOffset + 2*i;
  4669     if( end > top - sz ){
  4670       rc = defragmentPage(pPage);
  4671       if( rc ) return rc;
  4672       top = get2byte(&data[hdr+5]);
  4673       assert( end + sz <= top );
  4674     }
  4675     idx = allocateSpace(pPage, sz);
  4676     assert( idx>0 );
  4677     assert( end <= get2byte(&data[hdr+5]) );
  4678     if( idx+sz > pPage->pBt->usableSize ){
  4679       return SQLITE_CORRUPT_BKPT;
  4680     }
  4681     pPage->nCell++;
  4682     pPage->nFree -= 2;
  4683     memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
  4684     for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
  4685       ptr[0] = ptr[-2];
  4686       ptr[1] = ptr[-1];
  4687     }
  4688     put2byte(&data[ins], idx);
  4689     put2byte(&data[hdr+3], pPage->nCell);
  4690 #ifndef SQLITE_OMIT_AUTOVACUUM
  4691     if( pPage->pBt->autoVacuum ){
  4692       /* The cell may contain a pointer to an overflow page. If so, write
  4693       ** the entry for the overflow page into the pointer map.
  4694       */
  4695       CellInfo info;
  4696       sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4697       assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
  4698       if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
  4699         Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
  4700         rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
  4701         if( rc!=SQLITE_OK ) return rc;
  4702       }
  4703     }
  4704 #endif
  4705   }
  4706 
  4707   return SQLITE_OK;
  4708 }
  4709 
  4710 /*
  4711 ** Add a list of cells to a page.  The page should be initially empty.
  4712 ** The cells are guaranteed to fit on the page.
  4713 */
  4714 static void assemblePage(
  4715   MemPage *pPage,   /* The page to be assemblied */
  4716   int nCell,        /* The number of cells to add to this page */
  4717   u8 **apCell,      /* Pointers to cell bodies */
  4718   u16 *aSize        /* Sizes of the cells */
  4719 ){
  4720   int i;            /* Loop counter */
  4721   int totalSize;    /* Total size of all cells */
  4722   int hdr;          /* Index of page header */
  4723   int cellptr;      /* Address of next cell pointer */
  4724   int cellbody;     /* Address of next cell body */
  4725   u8 *data;         /* Data for the page */
  4726 
  4727   assert( pPage->nOverflow==0 );
  4728   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4729   totalSize = 0;
  4730   for(i=0; i<nCell; i++){
  4731     totalSize += aSize[i];
  4732   }
  4733   assert( totalSize+2*nCell<=pPage->nFree );
  4734   assert( pPage->nCell==0 );
  4735   cellptr = pPage->cellOffset;
  4736   data = pPage->aData;
  4737   hdr = pPage->hdrOffset;
  4738   put2byte(&data[hdr+3], nCell);
  4739   if( nCell ){
  4740     cellbody = allocateSpace(pPage, totalSize);
  4741     assert( cellbody>0 );
  4742     assert( pPage->nFree >= 2*nCell );
  4743     pPage->nFree -= 2*nCell;
  4744     for(i=0; i<nCell; i++){
  4745       put2byte(&data[cellptr], cellbody);
  4746       memcpy(&data[cellbody], apCell[i], aSize[i]);
  4747       cellptr += 2;
  4748       cellbody += aSize[i];
  4749     }
  4750     assert( cellbody==pPage->pBt->usableSize );
  4751   }
  4752   pPage->nCell = nCell;
  4753 }
  4754 
  4755 /*
  4756 ** The following parameters determine how many adjacent pages get involved
  4757 ** in a balancing operation.  NN is the number of neighbors on either side
  4758 ** of the page that participate in the balancing operation.  NB is the
  4759 ** total number of pages that participate, including the target page and
  4760 ** NN neighbors on either side.
  4761 **
  4762 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
  4763 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
  4764 ** in exchange for a larger degradation in INSERT and UPDATE performance.
  4765 ** The value of NN appears to give the best results overall.
  4766 */
  4767 #define NN 1             /* Number of neighbors on either side of pPage */
  4768 #define NB (NN*2+1)      /* Total pages involved in the balance */
  4769 
  4770 /* Forward reference */
  4771 static int balance(BtCursor*, int);
  4772 
  4773 #ifndef SQLITE_OMIT_QUICKBALANCE
  4774 /*
  4775 ** This version of balance() handles the common special case where
  4776 ** a new entry is being inserted on the extreme right-end of the
  4777 ** tree, in other words, when the new entry will become the largest
  4778 ** entry in the tree.
  4779 **
  4780 ** Instead of trying balance the 3 right-most leaf pages, just add
  4781 ** a new page to the right-hand side and put the one new entry in
  4782 ** that page.  This leaves the right side of the tree somewhat
  4783 ** unbalanced.  But odds are that we will be inserting new entries
  4784 ** at the end soon afterwards so the nearly empty page will quickly
  4785 ** fill up.  On average.
  4786 **
  4787 ** pPage is the leaf page which is the right-most page in the tree.
  4788 ** pParent is its parent.  pPage must have a single overflow entry
  4789 ** which is also the right-most entry on the page.
  4790 */
  4791 static int balance_quick(BtCursor *pCur){
  4792   int rc;
  4793   MemPage *pNew = 0;
  4794   Pgno pgnoNew;
  4795   u8 *pCell;
  4796   u16 szCell;
  4797   CellInfo info;
  4798   MemPage *pPage = pCur->apPage[pCur->iPage];
  4799   MemPage *pParent = pCur->apPage[pCur->iPage-1];
  4800   BtShared *pBt = pPage->pBt;
  4801   int parentIdx = pParent->nCell;   /* pParent new divider cell index */
  4802   int parentSize;                   /* Size of new divider cell */
  4803   u8 parentCell[64];                /* Space for the new divider cell */
  4804 
  4805   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4806 
  4807   /* Allocate a new page. Insert the overflow cell from pPage
  4808   ** into it. Then remove the overflow cell from pPage.
  4809   */
  4810   rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
  4811   if( rc==SQLITE_OK ){
  4812     pCell = pPage->aOvfl[0].pCell;
  4813     szCell = cellSizePtr(pPage, pCell);
  4814     zeroPage(pNew, pPage->aData[0]);
  4815     assemblePage(pNew, 1, &pCell, &szCell);
  4816     pPage->nOverflow = 0;
  4817   
  4818     /* pPage is currently the right-child of pParent. Change this
  4819     ** so that the right-child is the new page allocated above and
  4820     ** pPage is the next-to-right child. 
  4821     **
  4822     ** Ignore the return value of the call to fillInCell(). fillInCell()
  4823     ** may only return other than SQLITE_OK if it is required to allocate
  4824     ** one or more overflow pages. Since an internal table B-Tree cell 
  4825     ** may never spill over onto an overflow page (it is a maximum of 
  4826     ** 13 bytes in size), it is not neccessary to check the return code.
  4827     **
  4828     ** Similarly, the insertCell() function cannot fail if the page
  4829     ** being inserted into is already writable and the cell does not 
  4830     ** contain an overflow pointer. So ignore this return code too.
  4831     */
  4832     assert( pPage->nCell>0 );
  4833     pCell = findCell(pPage, pPage->nCell-1);
  4834     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  4835     fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
  4836     assert( parentSize<64 );
  4837     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  4838     insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
  4839     put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
  4840     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
  4841   
  4842     /* If this is an auto-vacuum database, update the pointer map
  4843     ** with entries for the new page, and any pointer from the 
  4844     ** cell on the page to an overflow page.
  4845     */
  4846     if( ISAUTOVACUUM ){
  4847       rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
  4848       if( rc==SQLITE_OK ){
  4849         rc = ptrmapPutOvfl(pNew, 0);
  4850       }
  4851     }
  4852 
  4853     /* Release the reference to the new page. */
  4854     releasePage(pNew);
  4855   }
  4856 
  4857   /* At this point the pPage->nFree variable is not set correctly with
  4858   ** respect to the content of the page (because it was set to 0 by 
  4859   ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
  4860   ** correct.
  4861   **
  4862   ** This has to be done even if an error will be returned. Normally, if
  4863   ** an error occurs during tree balancing, the contents of MemPage are
  4864   ** not important, as they will be recalculated when the page is rolled
  4865   ** back. But here, in balance_quick(), it is possible that pPage has 
  4866   ** not yet been marked dirty or written into the journal file. Therefore
  4867   ** it will not be rolled back and so it is important to make sure that
  4868   ** the page data and contents of MemPage are consistent.
  4869   */
  4870   pPage->isInit = 0;
  4871   sqlite3BtreeInitPage(pPage);
  4872 
  4873   /* If everything else succeeded, balance the parent page, in 
  4874   ** case the divider cell inserted caused it to become overfull.
  4875   */
  4876   if( rc==SQLITE_OK ){
  4877     releasePage(pPage);
  4878     pCur->iPage--;
  4879     rc = balance(pCur, 0);
  4880   }
  4881   return rc;
  4882 }
  4883 #endif /* SQLITE_OMIT_QUICKBALANCE */
  4884 
  4885 /*
  4886 ** This routine redistributes Cells on pPage and up to NN*2 siblings
  4887 ** of pPage so that all pages have about the same amount of free space.
  4888 ** Usually NN siblings on either side of pPage is used in the balancing,
  4889 ** though more siblings might come from one side if pPage is the first
  4890 ** or last child of its parent.  If pPage has fewer than 2*NN siblings
  4891 ** (something which can only happen if pPage is the root page or a 
  4892 ** child of root) then all available siblings participate in the balancing.
  4893 **
  4894 ** The number of siblings of pPage might be increased or decreased by one or
  4895 ** two in an effort to keep pages nearly full but not over full. The root page
  4896 ** is special and is allowed to be nearly empty. If pPage is 
  4897 ** the root page, then the depth of the tree might be increased
  4898 ** or decreased by one, as necessary, to keep the root page from being
  4899 ** overfull or completely empty.
  4900 **
  4901 ** Note that when this routine is called, some of the Cells on pPage
  4902 ** might not actually be stored in pPage->aData[].  This can happen
  4903 ** if the page is overfull.  Part of the job of this routine is to
  4904 ** make sure all Cells for pPage once again fit in pPage->aData[].
  4905 **
  4906 ** In the course of balancing the siblings of pPage, the parent of pPage
  4907 ** might become overfull or underfull.  If that happens, then this routine
  4908 ** is called recursively on the parent.
  4909 **
  4910 ** If this routine fails for any reason, it might leave the database
  4911 ** in a corrupted state.  So if this routine fails, the database should
  4912 ** be rolled back.
  4913 */
  4914 static int balance_nonroot(BtCursor *pCur){
  4915   MemPage *pPage;              /* The over or underfull page to balance */
  4916   MemPage *pParent;            /* The parent of pPage */
  4917   BtShared *pBt;               /* The whole database */
  4918   int nCell = 0;               /* Number of cells in apCell[] */
  4919   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  4920   int nOld;                    /* Number of pages in apOld[] */
  4921   int nNew;                    /* Number of pages in apNew[] */
  4922   int nDiv;                    /* Number of cells in apDiv[] */
  4923   int i, j, k;                 /* Loop counters */
  4924   int idx;                     /* Index of pPage in pParent->aCell[] */
  4925   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  4926   int rc;                      /* The return code */
  4927   int leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  4928   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  4929   int usableSpace;             /* Bytes in pPage beyond the header */
  4930   int pageFlags;               /* Value of pPage->aData[0] */
  4931   int subtotal;                /* Subtotal of bytes in cells on one page */
  4932   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  4933   int iSpace2 = 0;             /* First unused byte of aSpace2[] */
  4934   int szScratch;               /* Size of scratch memory requested */
  4935   MemPage *apOld[NB];          /* pPage and up to two siblings */
  4936   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
  4937   MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  4938   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  4939   Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  4940   u8 *apDiv[NB];               /* Divider cells in pParent */
  4941   int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  4942   int szNew[NB+2];             /* Combined size of cells place on i-th page */
  4943   u8 **apCell = 0;             /* All cells begin balanced */
  4944   u16 *szCell;                 /* Local size of all cells in apCell[] */
  4945   u8 *aCopy[NB];         /* Space for holding data of apCopy[] */
  4946   u8 *aSpace1;           /* Space for copies of dividers cells before balance */
  4947   u8 *aSpace2 = 0;       /* Space for overflow dividers cells after balance */
  4948   u8 *aFrom = 0;
  4949 
  4950   pPage = pCur->apPage[pCur->iPage];
  4951   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  4952   VVA_ONLY( pCur->pagesShuffled = 1 );
  4953 
  4954   /* 
  4955   ** Find the parent page.
  4956   */
  4957   assert( pCur->iPage>0 );
  4958   assert( pPage->isInit );
  4959   assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
  4960   pBt = pPage->pBt;
  4961   pParent = pCur->apPage[pCur->iPage-1];
  4962   assert( pParent );
  4963   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
  4964     return rc;
  4965   }
  4966 
  4967   TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
  4968 
  4969 #ifndef SQLITE_OMIT_QUICKBALANCE
  4970   /*
  4971   ** A special case:  If a new entry has just been inserted into a
  4972   ** table (that is, a btree with integer keys and all data at the leaves)
  4973   ** and the new entry is the right-most entry in the tree (it has the
  4974   ** largest key) then use the special balance_quick() routine for
  4975   ** balancing.  balance_quick() is much faster and results in a tighter
  4976   ** packing of data in the common case.
  4977   */
  4978   if( pPage->leaf &&
  4979       pPage->intKey &&
  4980       pPage->nOverflow==1 &&
  4981       pPage->aOvfl[0].idx==pPage->nCell &&
  4982       pParent->pgno!=1 &&
  4983       get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
  4984   ){
  4985     assert( pPage->intKey );
  4986     /*
  4987     ** TODO: Check the siblings to the left of pPage. It may be that
  4988     ** they are not full and no new page is required.
  4989     */
  4990     return balance_quick(pCur);
  4991   }
  4992 #endif
  4993 
  4994   if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
  4995     return rc;
  4996   }
  4997 
  4998   /*
  4999   ** Find the cell in the parent page whose left child points back
  5000   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  5001   ** is the rightmost child of pParent then set idx to pParent->nCell 
  5002   */
  5003   idx = pCur->aiIdx[pCur->iPage-1];
  5004   assertParentIndex(pParent, idx, pPage->pgno);
  5005 
  5006   /*
  5007   ** Initialize variables so that it will be safe to jump
  5008   ** directly to balance_cleanup at any moment.
  5009   */
  5010   nOld = nNew = 0;
  5011 
  5012   /*
  5013   ** Find sibling pages to pPage and the cells in pParent that divide
  5014   ** the siblings.  An attempt is made to find NN siblings on either
  5015   ** side of pPage.  More siblings are taken from one side, however, if
  5016   ** pPage there are fewer than NN siblings on the other side.  If pParent
  5017   ** has NB or fewer children then all children of pParent are taken.
  5018   */
  5019   nxDiv = idx - NN;
  5020   if( nxDiv + NB > pParent->nCell ){
  5021     nxDiv = pParent->nCell - NB + 1;
  5022   }
  5023   if( nxDiv<0 ){
  5024     nxDiv = 0;
  5025   }
  5026   nDiv = 0;
  5027   for(i=0, k=nxDiv; i<NB; i++, k++){
  5028     if( k<pParent->nCell ){
  5029       apDiv[i] = findCell(pParent, k);
  5030       nDiv++;
  5031       assert( !pParent->leaf );
  5032       pgnoOld[i] = get4byte(apDiv[i]);
  5033     }else if( k==pParent->nCell ){
  5034       pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
  5035     }else{
  5036       break;
  5037     }
  5038     rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
  5039     if( rc ) goto balance_cleanup;
  5040     /* apOld[i]->idxParent = k; */
  5041     apCopy[i] = 0;
  5042     assert( i==nOld );
  5043     nOld++;
  5044     nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  5045   }
  5046 
  5047   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  5048   ** alignment */
  5049   nMaxCells = (nMaxCells + 3)&~3;
  5050 
  5051   /*
  5052   ** Allocate space for memory structures
  5053   */
  5054   szScratch =
  5055        nMaxCells*sizeof(u8*)                       /* apCell */
  5056      + nMaxCells*sizeof(u16)                       /* szCell */
  5057      + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
  5058      + pBt->pageSize                               /* aSpace1 */
  5059      + (ISAUTOVACUUM ? nMaxCells : 0);             /* aFrom */
  5060   apCell = sqlite3ScratchMalloc( szScratch ); 
  5061   if( apCell==0 ){
  5062     rc = SQLITE_NOMEM;
  5063     goto balance_cleanup;
  5064   }
  5065   szCell = (u16*)&apCell[nMaxCells];
  5066   aCopy[0] = (u8*)&szCell[nMaxCells];
  5067   assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5068   for(i=1; i<NB; i++){
  5069     aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  5070     assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5071   }
  5072   aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
  5073   assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  5074   if( ISAUTOVACUUM ){
  5075     aFrom = &aSpace1[pBt->pageSize];
  5076   }
  5077   aSpace2 = sqlite3PageMalloc(pBt->pageSize);
  5078   if( aSpace2==0 ){
  5079     rc = SQLITE_NOMEM;
  5080     goto balance_cleanup;
  5081   }
  5082   
  5083   /*
  5084   ** Make copies of the content of pPage and its siblings into aOld[].
  5085   ** The rest of this function will use data from the copies rather
  5086   ** that the original pages since the original pages will be in the
  5087   ** process of being overwritten.
  5088   */
  5089   for(i=0; i<nOld; i++){
  5090     MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
  5091     memcpy(p, apOld[i], sizeof(MemPage));
  5092     p->aData = (void*)&p[1];
  5093     memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
  5094   }
  5095 
  5096   /*
  5097   ** Load pointers to all cells on sibling pages and the divider cells
  5098   ** into the local apCell[] array.  Make copies of the divider cells
  5099   ** into space obtained form aSpace1[] and remove the the divider Cells
  5100   ** from pParent.
  5101   **
  5102   ** If the siblings are on leaf pages, then the child pointers of the
  5103   ** divider cells are stripped from the cells before they are copied
  5104   ** into aSpace1[].  In this way, all cells in apCell[] are without
  5105   ** child pointers.  If siblings are not leaves, then all cell in
  5106   ** apCell[] include child pointers.  Either way, all cells in apCell[]
  5107   ** are alike.
  5108   **
  5109   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  5110   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  5111   */
  5112   nCell = 0;
  5113   leafCorrection = pPage->leaf*4;
  5114   leafData = pPage->hasData;
  5115   for(i=0; i<nOld; i++){
  5116     MemPage *pOld = apCopy[i];
  5117     int limit = pOld->nCell+pOld->nOverflow;
  5118     for(j=0; j<limit; j++){
  5119       assert( nCell<nMaxCells );
  5120       apCell[nCell] = findOverflowCell(pOld, j);
  5121       szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
  5122       if( ISAUTOVACUUM ){
  5123         int a;
  5124         aFrom[nCell] = i;
  5125         for(a=0; a<pOld->nOverflow; a++){
  5126           if( pOld->aOvfl[a].pCell==apCell[nCell] ){
  5127             aFrom[nCell] = 0xFF;
  5128             break;
  5129           }
  5130         }
  5131       }
  5132       nCell++;
  5133     }
  5134     if( i<nOld-1 ){
  5135       u16 sz = cellSizePtr(pParent, apDiv[i]);
  5136       if( leafData ){
  5137         /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
  5138         ** are duplicates of keys on the child pages.  We need to remove
  5139         ** the divider cells from pParent, but the dividers cells are not
  5140         ** added to apCell[] because they are duplicates of child cells.
  5141         */
  5142         dropCell(pParent, nxDiv, sz);
  5143       }else{
  5144         u8 *pTemp;
  5145         assert( nCell<nMaxCells );
  5146         szCell[nCell] = sz;
  5147         pTemp = &aSpace1[iSpace1];
  5148         iSpace1 += sz;
  5149         assert( sz<=pBt->pageSize/4 );
  5150         assert( iSpace1<=pBt->pageSize );
  5151         memcpy(pTemp, apDiv[i], sz);
  5152         apCell[nCell] = pTemp+leafCorrection;
  5153         if( ISAUTOVACUUM ){
  5154           aFrom[nCell] = 0xFF;
  5155         }
  5156         dropCell(pParent, nxDiv, sz);
  5157         szCell[nCell] -= leafCorrection;
  5158         assert( get4byte(pTemp)==pgnoOld[i] );
  5159         if( !pOld->leaf ){
  5160           assert( leafCorrection==0 );
  5161           /* The right pointer of the child page pOld becomes the left
  5162           ** pointer of the divider cell */
  5163           memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
  5164         }else{
  5165           assert( leafCorrection==4 );
  5166           if( szCell[nCell]<4 ){
  5167             /* Do not allow any cells smaller than 4 bytes. */
  5168             szCell[nCell] = 4;
  5169           }
  5170         }
  5171         nCell++;
  5172       }
  5173     }
  5174   }
  5175 
  5176   /*
  5177   ** Figure out the number of pages needed to hold all nCell cells.
  5178   ** Store this number in "k".  Also compute szNew[] which is the total
  5179   ** size of all cells on the i-th page and cntNew[] which is the index
  5180   ** in apCell[] of the cell that divides page i from page i+1.  
  5181   ** cntNew[k] should equal nCell.
  5182   **
  5183   ** Values computed by this block:
  5184   **
  5185   **           k: The total number of sibling pages
  5186   **    szNew[i]: Spaced used on the i-th sibling page.
  5187   **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  5188   **              the right of the i-th sibling page.
  5189   ** usableSpace: Number of bytes of space available on each sibling.
  5190   ** 
  5191   */
  5192   usableSpace = pBt->usableSize - 12 + leafCorrection;
  5193   for(subtotal=k=i=0; i<nCell; i++){
  5194     assert( i<nMaxCells );
  5195     subtotal += szCell[i] + 2;
  5196     if( subtotal > usableSpace ){
  5197       szNew[k] = subtotal - szCell[i];
  5198       cntNew[k] = i;
  5199       if( leafData ){ i--; }
  5200       subtotal = 0;
  5201       k++;
  5202     }
  5203   }
  5204   szNew[k] = subtotal;
  5205   cntNew[k] = nCell;
  5206   k++;
  5207 
  5208   /*
  5209   ** The packing computed by the previous block is biased toward the siblings
  5210   ** on the left side.  The left siblings are always nearly full, while the
  5211   ** right-most sibling might be nearly empty.  This block of code attempts
  5212   ** to adjust the packing of siblings to get a better balance.
  5213   **
  5214   ** This adjustment is more than an optimization.  The packing above might
  5215   ** be so out of balance as to be illegal.  For example, the right-most
  5216   ** sibling might be completely empty.  This adjustment is not optional.
  5217   */
  5218   for(i=k-1; i>0; i--){
  5219     int szRight = szNew[i];  /* Size of sibling on the right */
  5220     int szLeft = szNew[i-1]; /* Size of sibling on the left */
  5221     int r;              /* Index of right-most cell in left sibling */
  5222     int d;              /* Index of first cell to the left of right sibling */
  5223 
  5224     r = cntNew[i-1] - 1;
  5225     d = r + 1 - leafData;
  5226     assert( d<nMaxCells );
  5227     assert( r<nMaxCells );
  5228     while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
  5229       szRight += szCell[d] + 2;
  5230       szLeft -= szCell[r] + 2;
  5231       cntNew[i-1]--;
  5232       r = cntNew[i-1] - 1;
  5233       d = r + 1 - leafData;
  5234     }
  5235     szNew[i] = szRight;
  5236     szNew[i-1] = szLeft;
  5237   }
  5238 
  5239   /* Either we found one or more cells (cntnew[0])>0) or we are the
  5240   ** a virtual root page.  A virtual root page is when the real root
  5241   ** page is page 1 and we are the only child of that page.
  5242   */
  5243   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
  5244 
  5245   /*
  5246   ** Allocate k new pages.  Reuse old pages where possible.
  5247   */
  5248   assert( pPage->pgno>1 );
  5249   pageFlags = pPage->aData[0];
  5250   for(i=0; i<k; i++){
  5251     MemPage *pNew;
  5252     if( i<nOld ){
  5253       pNew = apNew[i] = apOld[i];
  5254       pgnoNew[i] = pgnoOld[i];
  5255       apOld[i] = 0;
  5256       rc = sqlite3PagerWrite(pNew->pDbPage);
  5257       nNew++;
  5258       if( rc ) goto balance_cleanup;
  5259     }else{
  5260       assert( i>0 );
  5261       rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
  5262       if( rc ) goto balance_cleanup;
  5263       apNew[i] = pNew;
  5264       nNew++;
  5265     }
  5266   }
  5267 
  5268   /* Free any old pages that were not reused as new pages.
  5269   */
  5270   while( i<nOld ){
  5271     rc = freePage(apOld[i]);
  5272     if( rc ) goto balance_cleanup;
  5273     releasePage(apOld[i]);
  5274     apOld[i] = 0;
  5275     i++;
  5276   }
  5277 
  5278   /*
  5279   ** Put the new pages in accending order.  This helps to
  5280   ** keep entries in the disk file in order so that a scan
  5281   ** of the table is a linear scan through the file.  That
  5282   ** in turn helps the operating system to deliver pages
  5283   ** from the disk more rapidly.
  5284   **
  5285   ** An O(n^2) insertion sort algorithm is used, but since
  5286   ** n is never more than NB (a small constant), that should
  5287   ** not be a problem.
  5288   **
  5289   ** When NB==3, this one optimization makes the database
  5290   ** about 25% faster for large insertions and deletions.
  5291   */
  5292   for(i=0; i<k-1; i++){
  5293     int minV = pgnoNew[i];
  5294     int minI = i;
  5295     for(j=i+1; j<k; j++){
  5296       if( pgnoNew[j]<(unsigned)minV ){
  5297         minI = j;
  5298         minV = pgnoNew[j];
  5299       }
  5300     }
  5301     if( minI>i ){
  5302       int t;
  5303       MemPage *pT;
  5304       t = pgnoNew[i];
  5305       pT = apNew[i];
  5306       pgnoNew[i] = pgnoNew[minI];
  5307       apNew[i] = apNew[minI];
  5308       pgnoNew[minI] = t;
  5309       apNew[minI] = pT;
  5310     }
  5311   }
  5312   TRACE(("BALANCE: old: %d %d %d  new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
  5313     pgnoOld[0], 
  5314     nOld>=2 ? pgnoOld[1] : 0,
  5315     nOld>=3 ? pgnoOld[2] : 0,
  5316     pgnoNew[0], szNew[0],
  5317     nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
  5318     nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
  5319     nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
  5320     nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
  5321 
  5322   /*
  5323   ** Evenly distribute the data in apCell[] across the new pages.
  5324   ** Insert divider cells into pParent as necessary.
  5325   */
  5326   j = 0;
  5327   for(i=0; i<nNew; i++){
  5328     /* Assemble the new sibling page. */
  5329     MemPage *pNew = apNew[i];
  5330     assert( j<nMaxCells );
  5331     assert( pNew->pgno==pgnoNew[i] );
  5332     zeroPage(pNew, pageFlags);
  5333     assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
  5334     assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
  5335     assert( pNew->nOverflow==0 );
  5336 
  5337     /* If this is an auto-vacuum database, update the pointer map entries
  5338     ** that point to the siblings that were rearranged. These can be: left
  5339     ** children of cells, the right-child of the page, or overflow pages
  5340     ** pointed to by cells.
  5341     */
  5342     if( ISAUTOVACUUM ){
  5343       for(k=j; k<cntNew[i]; k++){
  5344         assert( k<nMaxCells );
  5345         if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
  5346           rc = ptrmapPutOvfl(pNew, k-j);
  5347           if( rc==SQLITE_OK && leafCorrection==0 ){
  5348             rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
  5349           }
  5350           if( rc!=SQLITE_OK ){
  5351             goto balance_cleanup;
  5352           }
  5353         }
  5354       }
  5355     }
  5356 
  5357     j = cntNew[i];
  5358 
  5359     /* If the sibling page assembled above was not the right-most sibling,
  5360     ** insert a divider cell into the parent page.
  5361     */
  5362     if( i<nNew-1 && j<nCell ){
  5363       u8 *pCell;
  5364       u8 *pTemp;
  5365       int sz;
  5366 
  5367       assert( j<nMaxCells );
  5368       pCell = apCell[j];
  5369       sz = szCell[j] + leafCorrection;
  5370       pTemp = &aSpace2[iSpace2];
  5371       if( !pNew->leaf ){
  5372         memcpy(&pNew->aData[8], pCell, 4);
  5373         if( ISAUTOVACUUM 
  5374          && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
  5375         ){
  5376           rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
  5377           if( rc!=SQLITE_OK ){
  5378             goto balance_cleanup;
  5379           }
  5380         }
  5381       }else if( leafData ){
  5382         /* If the tree is a leaf-data tree, and the siblings are leaves, 
  5383         ** then there is no divider cell in apCell[]. Instead, the divider 
  5384         ** cell consists of the integer key for the right-most cell of 
  5385         ** the sibling-page assembled above only.
  5386         */
  5387         CellInfo info;
  5388         j--;
  5389         sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
  5390         pCell = pTemp;
  5391         fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
  5392         pTemp = 0;
  5393       }else{
  5394         pCell -= 4;
  5395         /* Obscure case for non-leaf-data trees: If the cell at pCell was
  5396         ** previously stored on a leaf node, and its reported size was 4
  5397         ** bytes, then it may actually be smaller than this 
  5398         ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
  5399         ** any cell). But it is important to pass the correct size to 
  5400         ** insertCell(), so reparse the cell now.
  5401         **
  5402         ** Note that this can never happen in an SQLite data file, as all
  5403         ** cells are at least 4 bytes. It only happens in b-trees used
  5404         ** to evaluate "IN (SELECT ...)" and similar clauses.
  5405         */
  5406         if( szCell[j]==4 ){
  5407           assert(leafCorrection==4);
  5408           sz = cellSizePtr(pParent, pCell);
  5409         }
  5410       }
  5411       iSpace2 += sz;
  5412       assert( sz<=pBt->pageSize/4 );
  5413       assert( iSpace2<=pBt->pageSize );
  5414       rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
  5415       if( rc!=SQLITE_OK ) goto balance_cleanup;
  5416       put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
  5417 
  5418       /* If this is an auto-vacuum database, and not a leaf-data tree,
  5419       ** then update the pointer map with an entry for the overflow page
  5420       ** that the cell just inserted points to (if any).
  5421       */
  5422       if( ISAUTOVACUUM && !leafData ){
  5423         rc = ptrmapPutOvfl(pParent, nxDiv);
  5424         if( rc!=SQLITE_OK ){
  5425           goto balance_cleanup;
  5426         }
  5427       }
  5428       j++;
  5429       nxDiv++;
  5430     }
  5431 
  5432     /* Set the pointer-map entry for the new sibling page. */
  5433     if( ISAUTOVACUUM ){
  5434       rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
  5435       if( rc!=SQLITE_OK ){
  5436         goto balance_cleanup;
  5437       }
  5438     }
  5439   }
  5440   assert( j==nCell );
  5441   assert( nOld>0 );
  5442   assert( nNew>0 );
  5443   if( (pageFlags & PTF_LEAF)==0 ){
  5444     u8 *zChild = &apCopy[nOld-1]->aData[8];
  5445     memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
  5446     if( ISAUTOVACUUM ){
  5447       rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
  5448       if( rc!=SQLITE_OK ){
  5449         goto balance_cleanup;
  5450       }
  5451     }
  5452   }
  5453   if( nxDiv==pParent->nCell+pParent->nOverflow ){
  5454     /* Right-most sibling is the right-most child of pParent */
  5455     put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
  5456   }else{
  5457     /* Right-most sibling is the left child of the first entry in pParent
  5458     ** past the right-most divider entry */
  5459     put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
  5460   }
  5461 
  5462   /*
  5463   ** Balance the parent page.  Note that the current page (pPage) might
  5464   ** have been added to the freelist so it might no longer be initialized.
  5465   ** But the parent page will always be initialized.
  5466   */
  5467   assert( pParent->isInit );
  5468   sqlite3ScratchFree(apCell);
  5469   apCell = 0;
  5470   releasePage(pPage);
  5471   pCur->iPage--;
  5472   rc = balance(pCur, 0);
  5473   
  5474   /*
  5475   ** Cleanup before returning.
  5476   */
  5477 balance_cleanup:
  5478   sqlite3PageFree(aSpace2);
  5479   sqlite3ScratchFree(apCell);
  5480   for(i=0; i<nOld; i++){
  5481     releasePage(apOld[i]);
  5482   }
  5483   for(i=0; i<nNew; i++){
  5484     releasePage(apNew[i]);
  5485   }
  5486 
  5487   /* releasePage(pParent); */
  5488   TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
  5489           pPage->pgno, nOld, nNew, nCell));
  5490 
  5491   return rc;
  5492 }
  5493 
  5494 /*
  5495 ** This routine is called for the root page of a btree when the root
  5496 ** page contains no cells.  This is an opportunity to make the tree
  5497 ** shallower by one level.
  5498 */
  5499 static int balance_shallower(BtCursor *pCur){
  5500   MemPage *pPage;              /* Root page of B-Tree */
  5501   MemPage *pChild;             /* The only child page of pPage */
  5502   Pgno pgnoChild;              /* Page number for pChild */
  5503   int rc = SQLITE_OK;          /* Return code from subprocedures */
  5504   BtShared *pBt;                  /* The main BTree structure */
  5505   int mxCellPerPage;           /* Maximum number of cells per page */
  5506   u8 **apCell;                 /* All cells from pages being balanced */
  5507   u16 *szCell;                 /* Local size of all cells */
  5508 
  5509   assert( pCur->iPage==0 );
  5510   pPage = pCur->apPage[0];
  5511 
  5512   assert( pPage->nCell==0 );
  5513   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5514   pBt = pPage->pBt;
  5515   mxCellPerPage = MX_CELL(pBt);
  5516   apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
  5517   if( apCell==0 ) return SQLITE_NOMEM;
  5518   szCell = (u16*)&apCell[mxCellPerPage];
  5519   if( pPage->leaf ){
  5520     /* The table is completely empty */
  5521     TRACE(("BALANCE: empty table %d\n", pPage->pgno));
  5522   }else{
  5523     /* The root page is empty but has one child.  Transfer the
  5524     ** information from that one child into the root page if it 
  5525     ** will fit.  This reduces the depth of the tree by one.
  5526     **
  5527     ** If the root page is page 1, it has less space available than
  5528     ** its child (due to the 100 byte header that occurs at the beginning
  5529     ** of the database fle), so it might not be able to hold all of the 
  5530     ** information currently contained in the child.  If this is the 
  5531     ** case, then do not do the transfer.  Leave page 1 empty except
  5532     ** for the right-pointer to the child page.  The child page becomes
  5533     ** the virtual root of the tree.
  5534     */
  5535     VVA_ONLY( pCur->pagesShuffled = 1 );
  5536     pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  5537     assert( pgnoChild>0 );
  5538     assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
  5539     rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
  5540     if( rc ) goto end_shallow_balance;
  5541     if( pPage->pgno==1 ){
  5542       rc = sqlite3BtreeInitPage(pChild);
  5543       if( rc ) goto end_shallow_balance;
  5544       assert( pChild->nOverflow==0 );
  5545       if( pChild->nFree>=100 ){
  5546         /* The child information will fit on the root page, so do the
  5547         ** copy */
  5548         int i;
  5549         zeroPage(pPage, pChild->aData[0]);
  5550         for(i=0; i<pChild->nCell; i++){
  5551           apCell[i] = findCell(pChild,i);
  5552           szCell[i] = cellSizePtr(pChild, apCell[i]);
  5553         }
  5554         assemblePage(pPage, pChild->nCell, apCell, szCell);
  5555         /* Copy the right-pointer of the child to the parent. */
  5556         put4byte(&pPage->aData[pPage->hdrOffset+8], 
  5557             get4byte(&pChild->aData[pChild->hdrOffset+8]));
  5558         freePage(pChild);
  5559         TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
  5560       }else{
  5561         /* The child has more information that will fit on the root.
  5562         ** The tree is already balanced.  Do nothing. */
  5563         TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
  5564       }
  5565     }else{
  5566       memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
  5567       pPage->isInit = 0;
  5568       rc = sqlite3BtreeInitPage(pPage);
  5569       assert( rc==SQLITE_OK );
  5570       freePage(pChild);
  5571       TRACE(("BALANCE: transfer child %d into root %d\n",
  5572               pChild->pgno, pPage->pgno));
  5573     }
  5574     assert( pPage->nOverflow==0 );
  5575     if( ISAUTOVACUUM ){
  5576       rc = setChildPtrmaps(pPage);
  5577     }
  5578     releasePage(pChild);
  5579   }
  5580 end_shallow_balance:
  5581   sqlite3_free(apCell);
  5582   return rc;
  5583 }
  5584 
  5585 
  5586 /*
  5587 ** The root page is overfull
  5588 **
  5589 ** When this happens, Create a new child page and copy the
  5590 ** contents of the root into the child.  Then make the root
  5591 ** page an empty page with rightChild pointing to the new
  5592 ** child.   Finally, call balance_internal() on the new child
  5593 ** to cause it to split.
  5594 */
  5595 static int balance_deeper(BtCursor *pCur){
  5596   int rc;             /* Return value from subprocedures */
  5597   MemPage *pPage;     /* Pointer to the root page */
  5598   MemPage *pChild;    /* Pointer to a new child page */
  5599   Pgno pgnoChild;     /* Page number of the new child page */
  5600   BtShared *pBt;         /* The BTree */
  5601   int usableSize;     /* Total usable size of a page */
  5602   u8 *data;           /* Content of the parent page */
  5603   u8 *cdata;          /* Content of the child page */
  5604   int hdr;            /* Offset to page header in parent */
  5605   int cbrk;           /* Offset to content of first cell in parent */
  5606 
  5607   assert( pCur->iPage==0 );
  5608   assert( pCur->apPage[0]->nOverflow>0 );
  5609 
  5610   VVA_ONLY( pCur->pagesShuffled = 1 );
  5611   pPage = pCur->apPage[0];
  5612   pBt = pPage->pBt;
  5613   assert( sqlite3_mutex_held(pBt->mutex) );
  5614   rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
  5615   if( rc ) return rc;
  5616   assert( sqlite3PagerIswriteable(pChild->pDbPage) );
  5617   usableSize = pBt->usableSize;
  5618   data = pPage->aData;
  5619   hdr = pPage->hdrOffset;
  5620   cbrk = get2byte(&data[hdr+5]);
  5621   cdata = pChild->aData;
  5622   memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
  5623   memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
  5624   
  5625   rc = sqlite3BtreeInitPage(pChild);
  5626   if( rc==SQLITE_OK ){
  5627     int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
  5628     memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
  5629     pChild->nOverflow = pPage->nOverflow;
  5630     if( pChild->nOverflow ){
  5631       pChild->nFree = 0;
  5632     }
  5633     assert( pChild->nCell==pPage->nCell );
  5634     zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
  5635     put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
  5636     TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
  5637     if( ISAUTOVACUUM ){
  5638       rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
  5639       if( rc==SQLITE_OK ){
  5640         rc = setChildPtrmaps(pChild);
  5641       }
  5642     }
  5643   }
  5644 
  5645   if( rc==SQLITE_OK ){
  5646     pCur->iPage++;
  5647     pCur->apPage[1] = pChild;
  5648     pCur->aiIdx[0] = 0;
  5649     rc = balance_nonroot(pCur);
  5650   }else{
  5651     releasePage(pChild);
  5652   }
  5653 
  5654   return rc;
  5655 }
  5656 
  5657 /*
  5658 ** The page that pCur currently points to has just been modified in
  5659 ** some way. This function figures out if this modification means the
  5660 ** tree needs to be balanced, and if so calls the appropriate balancing 
  5661 ** routine.
  5662 ** 
  5663 ** Parameter isInsert is true if a new cell was just inserted into the
  5664 ** page, or false otherwise.
  5665 */
  5666 static int balance(BtCursor *pCur, int isInsert){
  5667   int rc = SQLITE_OK;
  5668   MemPage *pPage = pCur->apPage[pCur->iPage];
  5669 
  5670   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  5671   if( pCur->iPage==0 ){
  5672     rc = sqlite3PagerWrite(pPage->pDbPage);
  5673     if( rc==SQLITE_OK && pPage->nOverflow>0 ){
  5674       rc = balance_deeper(pCur);
  5675     }
  5676     if( rc==SQLITE_OK && pPage->nCell==0 ){
  5677       rc = balance_shallower(pCur);
  5678     }
  5679   }else{
  5680     if( pPage->nOverflow>0 || 
  5681         (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
  5682       rc = balance_nonroot(pCur);
  5683     }
  5684   }
  5685   return rc;
  5686 }
  5687 
  5688 /*
  5689 ** This routine checks all cursors that point to table pgnoRoot.
  5690 ** If any of those cursors were opened with wrFlag==0 in a different
  5691 ** database connection (a database connection that shares the pager
  5692 ** cache with the current connection) and that other connection 
  5693 ** is not in the ReadUncommmitted state, then this routine returns 
  5694 ** SQLITE_LOCKED.
  5695 **
  5696 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and 
  5697 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental 
  5698 ** blob cursors are used for both reading and writing.
  5699 **
  5700 ** When pgnoRoot is the root page of an intkey table, this function is also
  5701 ** responsible for invalidating incremental blob cursors when the table row
  5702 ** on which they are opened is deleted or modified. Cursors are invalidated
  5703 ** according to the following rules:
  5704 **
  5705 **   1) When BtreeClearTable() is called to completely delete the contents
  5706 **      of a B-Tree table, pExclude is set to zero and parameter iRow is 
  5707 **      set to non-zero. In this case all incremental blob cursors open
  5708 **      on the table rooted at pgnoRoot are invalidated.
  5709 **
  5710 **   2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to 
  5711 **      modify a table row via an SQL statement, pExclude is set to the 
  5712 **      write cursor used to do the modification and parameter iRow is set
  5713 **      to the integer row id of the B-Tree entry being modified. Unless
  5714 **      pExclude is itself an incremental blob cursor, then all incremental
  5715 **      blob cursors open on row iRow of the B-Tree are invalidated.
  5716 **
  5717 **   3) If both pExclude and iRow are set to zero, no incremental blob 
  5718 **      cursors are invalidated.
  5719 */
  5720 static int checkReadLocks(
  5721   Btree *pBtree, 
  5722   Pgno pgnoRoot, 
  5723   BtCursor *pExclude,
  5724   i64 iRow
  5725 ){
  5726   BtCursor *p;
  5727   BtShared *pBt = pBtree->pBt;
  5728   sqlite3 *db = pBtree->db;
  5729   assert( sqlite3BtreeHoldsMutex(pBtree) );
  5730   for(p=pBt->pCursor; p; p=p->pNext){
  5731     if( p==pExclude ) continue;
  5732     if( p->pgnoRoot!=pgnoRoot ) continue;
  5733 #ifndef SQLITE_OMIT_INCRBLOB
  5734     if( p->isIncrblobHandle && ( 
  5735          (!pExclude && iRow)
  5736       || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
  5737     )){
  5738       p->eState = CURSOR_INVALID;
  5739     }
  5740 #endif
  5741     if( p->eState!=CURSOR_VALID ) continue;
  5742     if( p->wrFlag==0 
  5743 #ifndef SQLITE_OMIT_INCRBLOB
  5744      || p->isIncrblobHandle
  5745 #endif
  5746     ){
  5747       sqlite3 *dbOther = p->pBtree->db;
  5748       if( dbOther==0 ||
  5749          (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
  5750         return SQLITE_LOCKED;
  5751       }
  5752     }
  5753   }
  5754   return SQLITE_OK;
  5755 }
  5756 
  5757 /*
  5758 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
  5759 ** and the data is given by (pData,nData).  The cursor is used only to
  5760 ** define what table the record should be inserted into.  The cursor
  5761 ** is left pointing at a random location.
  5762 **
  5763 ** For an INTKEY table, only the nKey value of the key is used.  pKey is
  5764 ** ignored.  For a ZERODATA table, the pData and nData are both ignored.
  5765 */
  5766 int sqlite3BtreeInsert(
  5767   BtCursor *pCur,                /* Insert data into the table of this cursor */
  5768   const void *pKey, i64 nKey,    /* The key of the new record */
  5769   const void *pData, int nData,  /* The data of the new record */
  5770   int nZero,                     /* Number of extra 0 bytes to append to data */
  5771   int appendBias                 /* True if this is likely an append */
  5772 ){
  5773   int rc;
  5774   int loc;
  5775   int szNew;
  5776   int idx;
  5777   MemPage *pPage;
  5778   Btree *p = pCur->pBtree;
  5779   BtShared *pBt = p->pBt;
  5780   unsigned char *oldCell;
  5781   unsigned char *newCell = 0;
  5782 
  5783   assert( cursorHoldsMutex(pCur) );
  5784   if( pBt->inTransaction!=TRANS_WRITE ){
  5785     /* Must start a transaction before doing an insert */
  5786     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  5787     return rc;
  5788   }
  5789   assert( !pBt->readOnly );
  5790   if( !pCur->wrFlag ){
  5791     return SQLITE_PERM;   /* Cursor not open for writing */
  5792   }
  5793   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
  5794     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  5795   }
  5796   if( pCur->eState==CURSOR_FAULT ){
  5797     return pCur->skip;
  5798   }
  5799 
  5800   /* Save the positions of any other cursors open on this table */
  5801   sqlite3BtreeClearCursor(pCur);
  5802   if( 
  5803     SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
  5804     SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
  5805   ){
  5806     return rc;
  5807   }
  5808 
  5809   pPage = pCur->apPage[pCur->iPage];
  5810   assert( pPage->intKey || nKey>=0 );
  5811   assert( pPage->leaf || !pPage->intKey );
  5812   TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
  5813           pCur->pgnoRoot, nKey, nData, pPage->pgno,
  5814           loc==0 ? "overwrite" : "new entry"));
  5815   assert( pPage->isInit );
  5816   allocateTempSpace(pBt);
  5817   newCell = pBt->pTmpSpace;
  5818   if( newCell==0 ) return SQLITE_NOMEM;
  5819   rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  5820   if( rc ) goto end_insert;
  5821   assert( szNew==cellSizePtr(pPage, newCell) );
  5822   assert( szNew<=MX_CELL_SIZE(pBt) );
  5823   idx = pCur->aiIdx[pCur->iPage];
  5824   if( loc==0 && CURSOR_VALID==pCur->eState ){
  5825     u16 szOld;
  5826     assert( idx<pPage->nCell );
  5827     rc = sqlite3PagerWrite(pPage->pDbPage);
  5828     if( rc ){
  5829       goto end_insert;
  5830     }
  5831     oldCell = findCell(pPage, idx);
  5832     if( !pPage->leaf ){
  5833       memcpy(newCell, oldCell, 4);
  5834     }
  5835     szOld = cellSizePtr(pPage, oldCell);
  5836     rc = clearCell(pPage, oldCell);
  5837     if( rc ) goto end_insert;
  5838     rc = dropCell(pPage, idx, szOld);
  5839     if( rc ) goto end_insert;
  5840   }else if( loc<0 && pPage->nCell>0 ){
  5841     assert( pPage->leaf );
  5842     idx = ++pCur->aiIdx[pCur->iPage];
  5843     pCur->info.nSize = 0;
  5844     pCur->validNKey = 0;
  5845   }else{
  5846     assert( pPage->leaf );
  5847   }
  5848   rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
  5849   if( rc!=SQLITE_OK ) goto end_insert;
  5850   rc = balance(pCur, 1);
  5851   if( rc==SQLITE_OK ){
  5852     moveToRoot(pCur);
  5853   }
  5854 end_insert:
  5855   return rc;
  5856 }
  5857 
  5858 /*
  5859 ** Delete the entry that the cursor is pointing to.  The cursor
  5860 ** is left pointing at a arbitrary location.
  5861 */
  5862 int sqlite3BtreeDelete(BtCursor *pCur){
  5863   MemPage *pPage = pCur->apPage[pCur->iPage];
  5864   int idx;
  5865   unsigned char *pCell;
  5866   int rc;
  5867   Pgno pgnoChild = 0;
  5868   Btree *p = pCur->pBtree;
  5869   BtShared *pBt = p->pBt;
  5870 
  5871   assert( cursorHoldsMutex(pCur) );
  5872   assert( pPage->isInit );
  5873   if( pBt->inTransaction!=TRANS_WRITE ){
  5874     /* Must start a transaction before doing a delete */
  5875     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  5876     return rc;
  5877   }
  5878   assert( !pBt->readOnly );
  5879   if( pCur->eState==CURSOR_FAULT ){
  5880     return pCur->skip;
  5881   }
  5882   if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
  5883     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
  5884   }
  5885   if( !pCur->wrFlag ){
  5886     return SQLITE_PERM;   /* Did not open this cursor for writing */
  5887   }
  5888   if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
  5889     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  5890   }
  5891 
  5892   /* Restore the current cursor position (a no-op if the cursor is not in 
  5893   ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors 
  5894   ** open on the same table. Then call sqlite3PagerWrite() on the page
  5895   ** that the entry will be deleted from.
  5896   */
  5897   if( 
  5898     (rc = restoreCursorPosition(pCur))!=0 ||
  5899     (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
  5900     (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
  5901   ){
  5902     return rc;
  5903   }
  5904 
  5905   /* Locate the cell within its page and leave pCell pointing to the
  5906   ** data. The clearCell() call frees any overflow pages associated with the
  5907   ** cell. The cell itself is still intact.
  5908   */
  5909   idx = pCur->aiIdx[pCur->iPage];
  5910   pCell = findCell(pPage, idx);
  5911   if( !pPage->leaf ){
  5912     pgnoChild = get4byte(pCell);
  5913   }
  5914   rc = clearCell(pPage, pCell);
  5915   if( rc ){
  5916     return rc;
  5917   }
  5918 
  5919   if( !pPage->leaf ){
  5920     /*
  5921     ** The entry we are about to delete is not a leaf so if we do not
  5922     ** do something we will leave a hole on an internal page.
  5923     ** We have to fill the hole by moving in a cell from a leaf.  The
  5924     ** next Cell after the one to be deleted is guaranteed to exist and
  5925     ** to be a leaf so we can use it.
  5926     */
  5927     BtCursor leafCur;
  5928     MemPage *pLeafPage;
  5929 
  5930     unsigned char *pNext;
  5931     int notUsed;
  5932     unsigned char *tempCell = 0;
  5933     assert( !pPage->intKey );
  5934     sqlite3BtreeGetTempCursor(pCur, &leafCur);
  5935     rc = sqlite3BtreeNext(&leafCur, &notUsed);
  5936     if( rc==SQLITE_OK ){
  5937       assert( leafCur.aiIdx[leafCur.iPage]==0 );
  5938       pLeafPage = leafCur.apPage[leafCur.iPage];
  5939       rc = sqlite3PagerWrite(pLeafPage->pDbPage);
  5940     }
  5941     if( rc==SQLITE_OK ){
  5942       int leafCursorInvalid = 0;
  5943       u16 szNext;
  5944       TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
  5945          pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
  5946       dropCell(pPage, idx, cellSizePtr(pPage, pCell));
  5947       pNext = findCell(pLeafPage, 0);
  5948       szNext = cellSizePtr(pLeafPage, pNext);
  5949       assert( MX_CELL_SIZE(pBt)>=szNext+4 );
  5950       allocateTempSpace(pBt);
  5951       tempCell = pBt->pTmpSpace;
  5952       if( tempCell==0 ){
  5953         rc = SQLITE_NOMEM;
  5954       }
  5955       if( rc==SQLITE_OK ){
  5956         rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
  5957       }
  5958 
  5959 
  5960       /* The "if" statement in the next code block is critical.  The
  5961       ** slightest error in that statement would allow SQLite to operate
  5962       ** correctly most of the time but produce very rare failures.  To
  5963       ** guard against this, the following macros help to verify that
  5964       ** the "if" statement is well tested.
  5965       */
  5966       testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3 
  5967                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5968       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3 
  5969                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5970       testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1 
  5971                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5972       testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
  5973                  && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
  5974       testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
  5975                  && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
  5976 
  5977 
  5978       if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
  5979           (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
  5980       ){
  5981         /* This branch is taken if the internal node is now either overflowing
  5982         ** or underfull and the leaf node will be underfull after the just cell 
  5983         ** copied to the internal node is deleted from it. This is a special
  5984         ** case because the call to balance() to correct the internal node
  5985         ** may change the tree structure and invalidate the contents of
  5986         ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
  5987         ** used by the balance() required to correct the underfull leaf
  5988         ** node.
  5989         **
  5990         ** The formula used in the expression above are based on facets of
  5991         ** the SQLite file-format that do not change over time.
  5992         */
  5993         testcase( pPage->nFree==pBt->usableSize*2/3+1 );
  5994         testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
  5995         leafCursorInvalid = 1;
  5996       }        
  5997 
  5998       if( rc==SQLITE_OK ){
  5999         put4byte(findOverflowCell(pPage, idx), pgnoChild);
  6000         VVA_ONLY( pCur->pagesShuffled = 0 );
  6001         rc = balance(pCur, 0);
  6002       }
  6003 
  6004       if( rc==SQLITE_OK && leafCursorInvalid ){
  6005         /* The leaf-node is now underfull and so the tree needs to be 
  6006         ** rebalanced. However, the balance() operation on the internal
  6007         ** node above may have modified the structure of the B-Tree and
  6008         ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
  6009         ** may not be trusted.
  6010         **
  6011         ** It is not possible to copy the ancestry from pCur, as the same
  6012         ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
  6013         ** arrays. 
  6014         **
  6015         ** The call to saveCursorPosition() below internally saves the 
  6016         ** key that leafCur is currently pointing to. Currently, there
  6017         ** are two copies of that key in the tree - one here on the leaf
  6018         ** page and one on some internal node in the tree. The copy on
  6019         ** the leaf node is always the next key in tree-order after the 
  6020         ** copy on the internal node. So, the call to sqlite3BtreeNext()
  6021         ** calls restoreCursorPosition() to point the cursor to the copy
  6022         ** stored on the internal node, then advances to the next entry,
  6023         ** which happens to be the copy of the key on the internal node.
  6024         ** Net effect: leafCur is pointing back to the duplicate cell
  6025         ** that needs to be removed, and the leafCur.apPage[] and
  6026         ** leafCur.aiIdx[] arrays are correct.
  6027         */
  6028         VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
  6029         rc = saveCursorPosition(&leafCur);
  6030         if( rc==SQLITE_OK ){
  6031           rc = sqlite3BtreeNext(&leafCur, &notUsed);
  6032         }
  6033         pLeafPage = leafCur.apPage[leafCur.iPage];
  6034         assert( pLeafPage->pgno==leafPgno );
  6035         assert( leafCur.aiIdx[leafCur.iPage]==0 );
  6036       }
  6037 
  6038       if( rc==SQLITE_OK ){
  6039         dropCell(pLeafPage, 0, szNext);
  6040         VVA_ONLY( leafCur.pagesShuffled = 0 );
  6041         rc = balance(&leafCur, 0);
  6042         assert( leafCursorInvalid || !leafCur.pagesShuffled
  6043                                    || !pCur->pagesShuffled );
  6044       }
  6045     }
  6046     sqlite3BtreeReleaseTempCursor(&leafCur);
  6047   }else{
  6048     TRACE(("DELETE: table=%d delete from leaf %d\n",
  6049        pCur->pgnoRoot, pPage->pgno));
  6050     rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
  6051     if( rc==SQLITE_OK ){
  6052       rc = balance(pCur, 0);
  6053     }
  6054   }
  6055   if( rc==SQLITE_OK ){
  6056     moveToRoot(pCur);
  6057   }
  6058   return rc;
  6059 }
  6060 
  6061 /*
  6062 ** Create a new BTree table.  Write into *piTable the page
  6063 ** number for the root page of the new table.
  6064 **
  6065 ** The type of type is determined by the flags parameter.  Only the
  6066 ** following values of flags are currently in use.  Other values for
  6067 ** flags might not work:
  6068 **
  6069 **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
  6070 **     BTREE_ZERODATA                  Used for SQL indices
  6071 */
  6072 static int btreeCreateTable(Btree *p, int *piTable, int flags){
  6073   BtShared *pBt = p->pBt;
  6074   MemPage *pRoot;
  6075   Pgno pgnoRoot;
  6076   int rc;
  6077 
  6078   assert( sqlite3BtreeHoldsMutex(p) );
  6079   if( pBt->inTransaction!=TRANS_WRITE ){
  6080     /* Must start a transaction first */
  6081     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6082     return rc;
  6083   }
  6084   assert( !pBt->readOnly );
  6085 
  6086 #ifdef SQLITE_OMIT_AUTOVACUUM
  6087   rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6088   if( rc ){
  6089     return rc;
  6090   }
  6091 #else
  6092   if( pBt->autoVacuum ){
  6093     Pgno pgnoMove;      /* Move a page here to make room for the root-page */
  6094     MemPage *pPageMove; /* The page to move to. */
  6095 
  6096     /* Creating a new table may probably require moving an existing database
  6097     ** to make room for the new tables root page. In case this page turns
  6098     ** out to be an overflow page, delete all overflow page-map caches
  6099     ** held by open cursors.
  6100     */
  6101     invalidateAllOverflowCache(pBt);
  6102 
  6103     /* Read the value of meta[3] from the database to determine where the
  6104     ** root page of the new table should go. meta[3] is the largest root-page
  6105     ** created so far, so the new root-page is (meta[3]+1).
  6106     */
  6107     rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
  6108     if( rc!=SQLITE_OK ){
  6109       return rc;
  6110     }
  6111     pgnoRoot++;
  6112 
  6113     /* The new root-page may not be allocated on a pointer-map page, or the
  6114     ** PENDING_BYTE page.
  6115     */
  6116     while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
  6117         pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
  6118       pgnoRoot++;
  6119     }
  6120     assert( pgnoRoot>=3 );
  6121 
  6122     /* Allocate a page. The page that currently resides at pgnoRoot will
  6123     ** be moved to the allocated page (unless the allocated page happens
  6124     ** to reside at pgnoRoot).
  6125     */
  6126     rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
  6127     if( rc!=SQLITE_OK ){
  6128       return rc;
  6129     }
  6130 
  6131     if( pgnoMove!=pgnoRoot ){
  6132       /* pgnoRoot is the page that will be used for the root-page of
  6133       ** the new table (assuming an error did not occur). But we were
  6134       ** allocated pgnoMove. If required (i.e. if it was not allocated
  6135       ** by extending the file), the current page at position pgnoMove
  6136       ** is already journaled.
  6137       */
  6138       u8 eType;
  6139       Pgno iPtrPage;
  6140 
  6141       releasePage(pPageMove);
  6142 
  6143       /* Move the page currently at pgnoRoot to pgnoMove. */
  6144       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6145       if( rc!=SQLITE_OK ){
  6146         return rc;
  6147       }
  6148       rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
  6149       if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
  6150         releasePage(pRoot);
  6151         return rc;
  6152       }
  6153       assert( eType!=PTRMAP_ROOTPAGE );
  6154       assert( eType!=PTRMAP_FREEPAGE );
  6155       rc = sqlite3PagerWrite(pRoot->pDbPage);
  6156       if( rc!=SQLITE_OK ){
  6157         releasePage(pRoot);
  6158         return rc;
  6159       }
  6160       rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
  6161       releasePage(pRoot);
  6162 
  6163       /* Obtain the page at pgnoRoot */
  6164       if( rc!=SQLITE_OK ){
  6165         return rc;
  6166       }
  6167       rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
  6168       if( rc!=SQLITE_OK ){
  6169         return rc;
  6170       }
  6171       rc = sqlite3PagerWrite(pRoot->pDbPage);
  6172       if( rc!=SQLITE_OK ){
  6173         releasePage(pRoot);
  6174         return rc;
  6175       }
  6176     }else{
  6177       pRoot = pPageMove;
  6178     } 
  6179 
  6180     /* Update the pointer-map and meta-data with the new root-page number. */
  6181     rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
  6182     if( rc ){
  6183       releasePage(pRoot);
  6184       return rc;
  6185     }
  6186     rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
  6187     if( rc ){
  6188       releasePage(pRoot);
  6189       return rc;
  6190     }
  6191 
  6192   }else{
  6193     rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  6194     if( rc ) return rc;
  6195   }
  6196 #endif
  6197   assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  6198   zeroPage(pRoot, flags | PTF_LEAF);
  6199   sqlite3PagerUnref(pRoot->pDbPage);
  6200   *piTable = (int)pgnoRoot;
  6201   return SQLITE_OK;
  6202 }
  6203 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
  6204   int rc;
  6205   sqlite3BtreeEnter(p);
  6206   p->pBt->db = p->db;
  6207   rc = btreeCreateTable(p, piTable, flags);
  6208   sqlite3BtreeLeave(p);
  6209   return rc;
  6210 }
  6211 
  6212 /*
  6213 ** Erase the given database page and all its children.  Return
  6214 ** the page to the freelist.
  6215 */
  6216 static int clearDatabasePage(
  6217   BtShared *pBt,           /* The BTree that contains the table */
  6218   Pgno pgno,            /* Page number to clear */
  6219   MemPage *pParent,     /* Parent page.  NULL for the root */
  6220   int freePageFlag      /* Deallocate page if true */
  6221 ){
  6222   MemPage *pPage = 0;
  6223   int rc;
  6224   unsigned char *pCell;
  6225   int i;
  6226 
  6227   assert( sqlite3_mutex_held(pBt->mutex) );
  6228   if( pgno>pagerPagecount(pBt->pPager) ){
  6229     return SQLITE_CORRUPT_BKPT;
  6230   }
  6231 
  6232   rc = getAndInitPage(pBt, pgno, &pPage);
  6233   if( rc ) goto cleardatabasepage_out;
  6234   for(i=0; i<pPage->nCell; i++){
  6235     pCell = findCell(pPage, i);
  6236     if( !pPage->leaf ){
  6237       rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
  6238       if( rc ) goto cleardatabasepage_out;
  6239     }
  6240     rc = clearCell(pPage, pCell);
  6241     if( rc ) goto cleardatabasepage_out;
  6242   }
  6243   if( !pPage->leaf ){
  6244     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
  6245     if( rc ) goto cleardatabasepage_out;
  6246   }
  6247   if( freePageFlag ){
  6248     rc = freePage(pPage);
  6249   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
  6250     zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
  6251   }
  6252 
  6253 cleardatabasepage_out:
  6254   releasePage(pPage);
  6255   return rc;
  6256 }
  6257 
  6258 /*
  6259 ** Delete all information from a single table in the database.  iTable is
  6260 ** the page number of the root of the table.  After this routine returns,
  6261 ** the root page is empty, but still exists.
  6262 **
  6263 ** This routine will fail with SQLITE_LOCKED if there are any open
  6264 ** read cursors on the table.  Open write cursors are moved to the
  6265 ** root of the table.
  6266 */
  6267 int sqlite3BtreeClearTable(Btree *p, int iTable){
  6268   int rc;
  6269   BtShared *pBt = p->pBt;
  6270   sqlite3BtreeEnter(p);
  6271   pBt->db = p->db;
  6272   if( p->inTrans!=TRANS_WRITE ){
  6273     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6274   }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
  6275     /* nothing to do */
  6276   }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
  6277     /* nothing to do */
  6278   }else{
  6279     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
  6280   }
  6281   sqlite3BtreeLeave(p);
  6282   return rc;
  6283 }
  6284 
  6285 /*
  6286 ** Erase all information in a table and add the root of the table to
  6287 ** the freelist.  Except, the root of the principle table (the one on
  6288 ** page 1) is never added to the freelist.
  6289 **
  6290 ** This routine will fail with SQLITE_LOCKED if there are any open
  6291 ** cursors on the table.
  6292 **
  6293 ** If AUTOVACUUM is enabled and the page at iTable is not the last
  6294 ** root page in the database file, then the last root page 
  6295 ** in the database file is moved into the slot formerly occupied by
  6296 ** iTable and that last slot formerly occupied by the last root page
  6297 ** is added to the freelist instead of iTable.  In this say, all
  6298 ** root pages are kept at the beginning of the database file, which
  6299 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
  6300 ** page number that used to be the last root page in the file before
  6301 ** the move.  If no page gets moved, *piMoved is set to 0.
  6302 ** The last root page is recorded in meta[3] and the value of
  6303 ** meta[3] is updated by this procedure.
  6304 */
  6305 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
  6306   int rc;
  6307   MemPage *pPage = 0;
  6308   BtShared *pBt = p->pBt;
  6309 
  6310   assert( sqlite3BtreeHoldsMutex(p) );
  6311   if( p->inTrans!=TRANS_WRITE ){
  6312     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6313   }
  6314 
  6315   /* It is illegal to drop a table if any cursors are open on the
  6316   ** database. This is because in auto-vacuum mode the backend may
  6317   ** need to move another root-page to fill a gap left by the deleted
  6318   ** root page. If an open cursor was using this page a problem would 
  6319   ** occur.
  6320   */
  6321   if( pBt->pCursor ){
  6322     return SQLITE_LOCKED;
  6323   }
  6324 
  6325   rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
  6326   if( rc ) return rc;
  6327   rc = sqlite3BtreeClearTable(p, iTable);
  6328   if( rc ){
  6329     releasePage(pPage);
  6330     return rc;
  6331   }
  6332 
  6333   *piMoved = 0;
  6334 
  6335   if( iTable>1 ){
  6336 #ifdef SQLITE_OMIT_AUTOVACUUM
  6337     rc = freePage(pPage);
  6338     releasePage(pPage);
  6339 #else
  6340     if( pBt->autoVacuum ){
  6341       Pgno maxRootPgno;
  6342       rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
  6343       if( rc!=SQLITE_OK ){
  6344         releasePage(pPage);
  6345         return rc;
  6346       }
  6347 
  6348       if( iTable==maxRootPgno ){
  6349         /* If the table being dropped is the table with the largest root-page
  6350         ** number in the database, put the root page on the free list. 
  6351         */
  6352         rc = freePage(pPage);
  6353         releasePage(pPage);
  6354         if( rc!=SQLITE_OK ){
  6355           return rc;
  6356         }
  6357       }else{
  6358         /* The table being dropped does not have the largest root-page
  6359         ** number in the database. So move the page that does into the 
  6360         ** gap left by the deleted root-page.
  6361         */
  6362         MemPage *pMove;
  6363         releasePage(pPage);
  6364         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
  6365         if( rc!=SQLITE_OK ){
  6366           return rc;
  6367         }
  6368         rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
  6369         releasePage(pMove);
  6370         if( rc!=SQLITE_OK ){
  6371           return rc;
  6372         }
  6373         rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
  6374         if( rc!=SQLITE_OK ){
  6375           return rc;
  6376         }
  6377         rc = freePage(pMove);
  6378         releasePage(pMove);
  6379         if( rc!=SQLITE_OK ){
  6380           return rc;
  6381         }
  6382         *piMoved = maxRootPgno;
  6383       }
  6384 
  6385       /* Set the new 'max-root-page' value in the database header. This
  6386       ** is the old value less one, less one more if that happens to
  6387       ** be a root-page number, less one again if that is the
  6388       ** PENDING_BYTE_PAGE.
  6389       */
  6390       maxRootPgno--;
  6391       if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
  6392         maxRootPgno--;
  6393       }
  6394       if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
  6395         maxRootPgno--;
  6396       }
  6397       assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
  6398 
  6399       rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
  6400     }else{
  6401       rc = freePage(pPage);
  6402       releasePage(pPage);
  6403     }
  6404 #endif
  6405   }else{
  6406     /* If sqlite3BtreeDropTable was called on page 1. */
  6407     zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
  6408     releasePage(pPage);
  6409   }
  6410   return rc;  
  6411 }
  6412 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  6413   int rc;
  6414   sqlite3BtreeEnter(p);
  6415   p->pBt->db = p->db;
  6416   rc = btreeDropTable(p, iTable, piMoved);
  6417   sqlite3BtreeLeave(p);
  6418   return rc;
  6419 }
  6420 
  6421 
  6422 /*
  6423 ** Read the meta-information out of a database file.  Meta[0]
  6424 ** is the number of free pages currently in the database.  Meta[1]
  6425 ** through meta[15] are available for use by higher layers.  Meta[0]
  6426 ** is read-only, the others are read/write.
  6427 ** 
  6428 ** The schema layer numbers meta values differently.  At the schema
  6429 ** layer (and the SetCookie and ReadCookie opcodes) the number of
  6430 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
  6431 */
  6432 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
  6433   DbPage *pDbPage;
  6434   int rc;
  6435   unsigned char *pP1;
  6436   BtShared *pBt = p->pBt;
  6437 
  6438   sqlite3BtreeEnter(p);
  6439   pBt->db = p->db;
  6440 
  6441   /* Reading a meta-data value requires a read-lock on page 1 (and hence
  6442   ** the sqlite_master table. We grab this lock regardless of whether or
  6443   ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
  6444   ** 1 is treated as a special case by queryTableLock() and lockTable()).
  6445   */
  6446   rc = queryTableLock(p, 1, READ_LOCK);
  6447   if( rc!=SQLITE_OK ){
  6448     sqlite3BtreeLeave(p);
  6449     return rc;
  6450   }
  6451 
  6452   assert( idx>=0 && idx<=15 );
  6453   if( pBt->pPage1 ){
  6454     /* The b-tree is already holding a reference to page 1 of the database
  6455     ** file. In this case the required meta-data value can be read directly
  6456     ** from the page data of this reference. This is slightly faster than
  6457     ** requesting a new reference from the pager layer.
  6458     */
  6459     pP1 = (unsigned char *)pBt->pPage1->aData;
  6460   }else{
  6461     /* The b-tree does not have a reference to page 1 of the database file.
  6462     ** Obtain one from the pager layer.
  6463     */
  6464     rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
  6465     if( rc ){
  6466       sqlite3BtreeLeave(p);
  6467       return rc;
  6468     }
  6469     pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
  6470   }
  6471   *pMeta = get4byte(&pP1[36 + idx*4]);
  6472 
  6473   /* If the b-tree is not holding a reference to page 1, then one was 
  6474   ** requested from the pager layer in the above block. Release it now.
  6475   */
  6476   if( !pBt->pPage1 ){
  6477     sqlite3PagerUnref(pDbPage);
  6478   }
  6479 
  6480   /* If autovacuumed is disabled in this build but we are trying to 
  6481   ** access an autovacuumed database, then make the database readonly. 
  6482   */
  6483 #ifdef SQLITE_OMIT_AUTOVACUUM
  6484   if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
  6485 #endif
  6486 
  6487   /* Grab the read-lock on page 1. */
  6488   rc = lockTable(p, 1, READ_LOCK);
  6489   sqlite3BtreeLeave(p);
  6490   return rc;
  6491 }
  6492 
  6493 /*
  6494 ** Write meta-information back into the database.  Meta[0] is
  6495 ** read-only and may not be written.
  6496 */
  6497 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
  6498   BtShared *pBt = p->pBt;
  6499   unsigned char *pP1;
  6500   int rc;
  6501   assert( idx>=1 && idx<=15 );
  6502   sqlite3BtreeEnter(p);
  6503   pBt->db = p->db;
  6504   if( p->inTrans!=TRANS_WRITE ){
  6505     rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
  6506   }else{
  6507     assert( pBt->pPage1!=0 );
  6508     pP1 = pBt->pPage1->aData;
  6509     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  6510     if( rc==SQLITE_OK ){
  6511       put4byte(&pP1[36 + idx*4], iMeta);
  6512 #ifndef SQLITE_OMIT_AUTOVACUUM
  6513       if( idx==7 ){
  6514         assert( pBt->autoVacuum || iMeta==0 );
  6515         assert( iMeta==0 || iMeta==1 );
  6516         pBt->incrVacuum = iMeta;
  6517       }
  6518 #endif
  6519     }
  6520   }
  6521   sqlite3BtreeLeave(p);
  6522   return rc;
  6523 }
  6524 
  6525 /*
  6526 ** Return the flag byte at the beginning of the page that the cursor
  6527 ** is currently pointing to.
  6528 */
  6529 int sqlite3BtreeFlags(BtCursor *pCur){
  6530   /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
  6531   ** restoreCursorPosition() here.
  6532   */
  6533   MemPage *pPage;
  6534   restoreCursorPosition(pCur);
  6535   pPage = pCur->apPage[pCur->iPage];
  6536   assert( cursorHoldsMutex(pCur) );
  6537   assert( pPage->pBt==pCur->pBt );
  6538   return pPage ? pPage->aData[pPage->hdrOffset] : 0;
  6539 }
  6540 
  6541 
  6542 /*
  6543 ** Return the pager associated with a BTree.  This routine is used for
  6544 ** testing and debugging only.
  6545 */
  6546 Pager *sqlite3BtreePager(Btree *p){
  6547   return p->pBt->pPager;
  6548 }
  6549 
  6550 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6551 /*
  6552 ** Append a message to the error message string.
  6553 */
  6554 static void checkAppendMsg(
  6555   IntegrityCk *pCheck,
  6556   char *zMsg1,
  6557   const char *zFormat,
  6558   ...
  6559 ){
  6560   va_list ap;
  6561   if( !pCheck->mxErr ) return;
  6562   pCheck->mxErr--;
  6563   pCheck->nErr++;
  6564   va_start(ap, zFormat);
  6565   if( pCheck->errMsg.nChar ){
  6566     sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
  6567   }
  6568   if( zMsg1 ){
  6569     sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
  6570   }
  6571   sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
  6572   va_end(ap);
  6573   if( pCheck->errMsg.mallocFailed ){
  6574     pCheck->mallocFailed = 1;
  6575   }
  6576 }
  6577 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6578 
  6579 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6580 /*
  6581 ** Add 1 to the reference count for page iPage.  If this is the second
  6582 ** reference to the page, add an error message to pCheck->zErrMsg.
  6583 ** Return 1 if there are 2 ore more references to the page and 0 if
  6584 ** if this is the first reference to the page.
  6585 **
  6586 ** Also check that the page number is in bounds.
  6587 */
  6588 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
  6589   if( iPage==0 ) return 1;
  6590   if( iPage>pCheck->nPage || iPage<0 ){
  6591     checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
  6592     return 1;
  6593   }
  6594   if( pCheck->anRef[iPage]==1 ){
  6595     checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
  6596     return 1;
  6597   }
  6598   return  (pCheck->anRef[iPage]++)>1;
  6599 }
  6600 
  6601 #ifndef SQLITE_OMIT_AUTOVACUUM
  6602 /*
  6603 ** Check that the entry in the pointer-map for page iChild maps to 
  6604 ** page iParent, pointer type ptrType. If not, append an error message
  6605 ** to pCheck.
  6606 */
  6607 static void checkPtrmap(
  6608   IntegrityCk *pCheck,   /* Integrity check context */
  6609   Pgno iChild,           /* Child page number */
  6610   u8 eType,              /* Expected pointer map type */
  6611   Pgno iParent,          /* Expected pointer map parent page number */
  6612   char *zContext         /* Context description (used for error msg) */
  6613 ){
  6614   int rc;
  6615   u8 ePtrmapType;
  6616   Pgno iPtrmapParent;
  6617 
  6618   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
  6619   if( rc!=SQLITE_OK ){
  6620     checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
  6621     return;
  6622   }
  6623 
  6624   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
  6625     checkAppendMsg(pCheck, zContext, 
  6626       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
  6627       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
  6628   }
  6629 }
  6630 #endif
  6631 
  6632 /*
  6633 ** Check the integrity of the freelist or of an overflow page list.
  6634 ** Verify that the number of pages on the list is N.
  6635 */
  6636 static void checkList(
  6637   IntegrityCk *pCheck,  /* Integrity checking context */
  6638   int isFreeList,       /* True for a freelist.  False for overflow page list */
  6639   int iPage,            /* Page number for first page in the list */
  6640   int N,                /* Expected number of pages in the list */
  6641   char *zContext        /* Context for error messages */
  6642 ){
  6643   int i;
  6644   int expected = N;
  6645   int iFirst = iPage;
  6646   while( N-- > 0 && pCheck->mxErr ){
  6647     DbPage *pOvflPage;
  6648     unsigned char *pOvflData;
  6649     if( iPage<1 ){
  6650       checkAppendMsg(pCheck, zContext,
  6651          "%d of %d pages missing from overflow list starting at %d",
  6652           N+1, expected, iFirst);
  6653       break;
  6654     }
  6655     if( checkRef(pCheck, iPage, zContext) ) break;
  6656     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
  6657       checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
  6658       break;
  6659     }
  6660     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
  6661     if( isFreeList ){
  6662       int n = get4byte(&pOvflData[4]);
  6663 #ifndef SQLITE_OMIT_AUTOVACUUM
  6664       if( pCheck->pBt->autoVacuum ){
  6665         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
  6666       }
  6667 #endif
  6668       if( n>pCheck->pBt->usableSize/4-2 ){
  6669         checkAppendMsg(pCheck, zContext,
  6670            "freelist leaf count too big on page %d", iPage);
  6671         N--;
  6672       }else{
  6673         for(i=0; i<n; i++){
  6674           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
  6675 #ifndef SQLITE_OMIT_AUTOVACUUM
  6676           if( pCheck->pBt->autoVacuum ){
  6677             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
  6678           }
  6679 #endif
  6680           checkRef(pCheck, iFreePage, zContext);
  6681         }
  6682         N -= n;
  6683       }
  6684     }
  6685 #ifndef SQLITE_OMIT_AUTOVACUUM
  6686     else{
  6687       /* If this database supports auto-vacuum and iPage is not the last
  6688       ** page in this overflow list, check that the pointer-map entry for
  6689       ** the following page matches iPage.
  6690       */
  6691       if( pCheck->pBt->autoVacuum && N>0 ){
  6692         i = get4byte(pOvflData);
  6693         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
  6694       }
  6695     }
  6696 #endif
  6697     iPage = get4byte(pOvflData);
  6698     sqlite3PagerUnref(pOvflPage);
  6699   }
  6700 }
  6701 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6702 
  6703 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6704 /*
  6705 ** Do various sanity checks on a single page of a tree.  Return
  6706 ** the tree depth.  Root pages return 0.  Parents of root pages
  6707 ** return 1, and so forth.
  6708 ** 
  6709 ** These checks are done:
  6710 **
  6711 **      1.  Make sure that cells and freeblocks do not overlap
  6712 **          but combine to completely cover the page.
  6713 **  NO  2.  Make sure cell keys are in order.
  6714 **  NO  3.  Make sure no key is less than or equal to zLowerBound.
  6715 **  NO  4.  Make sure no key is greater than or equal to zUpperBound.
  6716 **      5.  Check the integrity of overflow pages.
  6717 **      6.  Recursively call checkTreePage on all children.
  6718 **      7.  Verify that the depth of all children is the same.
  6719 **      8.  Make sure this page is at least 33% full or else it is
  6720 **          the root of the tree.
  6721 */
  6722 static int checkTreePage(
  6723   IntegrityCk *pCheck,  /* Context for the sanity check */
  6724   int iPage,            /* Page number of the page to check */
  6725   MemPage *pParent,     /* Parent page */
  6726   char *zParentContext  /* Parent context */
  6727 ){
  6728   MemPage *pPage;
  6729   int i, rc, depth, d2, pgno, cnt;
  6730   int hdr, cellStart;
  6731   int nCell;
  6732   u8 *data;
  6733   BtShared *pBt;
  6734   int usableSize;
  6735   char zContext[100];
  6736   char *hit = 0;
  6737 
  6738   sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
  6739 
  6740   /* Check that the page exists
  6741   */
  6742   pBt = pCheck->pBt;
  6743   usableSize = pBt->usableSize;
  6744   if( iPage==0 ) return 0;
  6745   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
  6746   if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
  6747     checkAppendMsg(pCheck, zContext,
  6748        "unable to get the page. error code=%d", rc);
  6749     return 0;
  6750   }
  6751   if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
  6752     checkAppendMsg(pCheck, zContext, 
  6753                    "sqlite3BtreeInitPage() returns error code %d", rc);
  6754     releasePage(pPage);
  6755     return 0;
  6756   }
  6757 
  6758   /* Check out all the cells.
  6759   */
  6760   depth = 0;
  6761   for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
  6762     u8 *pCell;
  6763     int sz;
  6764     CellInfo info;
  6765 
  6766     /* Check payload overflow pages
  6767     */
  6768     sqlite3_snprintf(sizeof(zContext), zContext,
  6769              "On tree page %d cell %d: ", iPage, i);
  6770     pCell = findCell(pPage,i);
  6771     sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  6772     sz = info.nData;
  6773     if( !pPage->intKey ) sz += info.nKey;
  6774     assert( sz==info.nPayload );
  6775     if( sz>info.nLocal ){
  6776       int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
  6777       Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
  6778 #ifndef SQLITE_OMIT_AUTOVACUUM
  6779       if( pBt->autoVacuum ){
  6780         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
  6781       }
  6782 #endif
  6783       checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
  6784     }
  6785 
  6786     /* Check sanity of left child page.
  6787     */
  6788     if( !pPage->leaf ){
  6789       pgno = get4byte(pCell);
  6790 #ifndef SQLITE_OMIT_AUTOVACUUM
  6791       if( pBt->autoVacuum ){
  6792         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
  6793       }
  6794 #endif
  6795       d2 = checkTreePage(pCheck,pgno,pPage,zContext);
  6796       if( i>0 && d2!=depth ){
  6797         checkAppendMsg(pCheck, zContext, "Child page depth differs");
  6798       }
  6799       depth = d2;
  6800     }
  6801   }
  6802   if( !pPage->leaf ){
  6803     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  6804     sqlite3_snprintf(sizeof(zContext), zContext, 
  6805                      "On page %d at right child: ", iPage);
  6806 #ifndef SQLITE_OMIT_AUTOVACUUM
  6807     if( pBt->autoVacuum ){
  6808       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
  6809     }
  6810 #endif
  6811     checkTreePage(pCheck, pgno, pPage, zContext);
  6812   }
  6813  
  6814   /* Check for complete coverage of the page
  6815   */
  6816   data = pPage->aData;
  6817   hdr = pPage->hdrOffset;
  6818   hit = sqlite3PageMalloc( pBt->pageSize );
  6819   if( hit==0 ){
  6820     pCheck->mallocFailed = 1;
  6821   }else{
  6822     u16 contentOffset = get2byte(&data[hdr+5]);
  6823     if (contentOffset > usableSize) {
  6824       checkAppendMsg(pCheck, 0, 
  6825                      "Corruption detected in header on page %d",iPage,0);
  6826       goto check_page_abort;
  6827     }
  6828     memset(hit, 0, usableSize );
  6829     memset(hit, 1, get2byte(&data[hdr+5]));
  6830     nCell = get2byte(&data[hdr+3]);
  6831     cellStart = hdr + 12 - 4*pPage->leaf;
  6832     for(i=0; i<nCell; i++){
  6833       int pc = get2byte(&data[cellStart+i*2]);
  6834       u16 size = 1024;
  6835       int j;
  6836       if( pc<=usableSize ){
  6837         size = cellSizePtr(pPage, &data[pc]);
  6838       }
  6839       if( (pc+size-1)>=usableSize || pc<0 ){
  6840         checkAppendMsg(pCheck, 0, 
  6841             "Corruption detected in cell %d on page %d",i,iPage,0);
  6842       }else{
  6843         for(j=pc+size-1; j>=pc; j--) hit[j]++;
  6844       }
  6845     }
  6846     for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000; 
  6847            cnt++){
  6848       int size = get2byte(&data[i+2]);
  6849       int j;
  6850       if( (i+size-1)>=usableSize || i<0 ){
  6851         checkAppendMsg(pCheck, 0,  
  6852             "Corruption detected in cell %d on page %d",i,iPage,0);
  6853       }else{
  6854         for(j=i+size-1; j>=i; j--) hit[j]++;
  6855       }
  6856       i = get2byte(&data[i]);
  6857     }
  6858     for(i=cnt=0; i<usableSize; i++){
  6859       if( hit[i]==0 ){
  6860         cnt++;
  6861       }else if( hit[i]>1 ){
  6862         checkAppendMsg(pCheck, 0,
  6863           "Multiple uses for byte %d of page %d", i, iPage);
  6864         break;
  6865       }
  6866     }
  6867     if( cnt!=data[hdr+7] ){
  6868       checkAppendMsg(pCheck, 0, 
  6869           "Fragmented space is %d byte reported as %d on page %d",
  6870           cnt, data[hdr+7], iPage);
  6871     }
  6872   }
  6873 
  6874 check_page_abort:
  6875   if( hit ) sqlite3PageFree(hit);
  6876 
  6877   releasePage(pPage);
  6878   return depth+1;
  6879 }
  6880 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  6881 
  6882 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
  6883 /*
  6884 ** This routine does a complete check of the given BTree file.  aRoot[] is
  6885 ** an array of pages numbers were each page number is the root page of
  6886 ** a table.  nRoot is the number of entries in aRoot.
  6887 **
  6888 ** Write the number of error seen in *pnErr.  Except for some memory
  6889 ** allocation errors,  nn error message is held in memory obtained from
  6890 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
  6891 ** returned.
  6892 */
  6893 char *sqlite3BtreeIntegrityCheck(
  6894   Btree *p,     /* The btree to be checked */
  6895   int *aRoot,   /* An array of root pages numbers for individual trees */
  6896   int nRoot,    /* Number of entries in aRoot[] */
  6897   int mxErr,    /* Stop reporting errors after this many */
  6898   int *pnErr    /* Write number of errors seen to this variable */
  6899 ){
  6900   int i;
  6901   int nRef;
  6902   IntegrityCk sCheck;
  6903   BtShared *pBt = p->pBt;
  6904   char zErr[100];
  6905 
  6906   sqlite3BtreeEnter(p);
  6907   pBt->db = p->db;
  6908   nRef = sqlite3PagerRefcount(pBt->pPager);
  6909   if( lockBtreeWithRetry(p)!=SQLITE_OK ){
  6910     *pnErr = 1;
  6911     sqlite3BtreeLeave(p);
  6912     return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
  6913   }
  6914   sCheck.pBt = pBt;
  6915   sCheck.pPager = pBt->pPager;
  6916   sCheck.nPage = pagerPagecount(sCheck.pPager);
  6917   sCheck.mxErr = mxErr;
  6918   sCheck.nErr = 0;
  6919   sCheck.mallocFailed = 0;
  6920   *pnErr = 0;
  6921 #ifndef SQLITE_OMIT_AUTOVACUUM
  6922   if( pBt->nTrunc!=0 ){
  6923     sCheck.nPage = pBt->nTrunc;
  6924   }
  6925 #endif
  6926   if( sCheck.nPage==0 ){
  6927     unlockBtreeIfUnused(pBt);
  6928     sqlite3BtreeLeave(p);
  6929     return 0;
  6930   }
  6931   sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
  6932   if( !sCheck.anRef ){
  6933     unlockBtreeIfUnused(pBt);
  6934     *pnErr = 1;
  6935     sqlite3BtreeLeave(p);
  6936     return 0;
  6937   }
  6938   for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
  6939   i = PENDING_BYTE_PAGE(pBt);
  6940   if( i<=sCheck.nPage ){
  6941     sCheck.anRef[i] = 1;
  6942   }
  6943   sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
  6944 
  6945   /* Check the integrity of the freelist
  6946   */
  6947   checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
  6948             get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
  6949 
  6950   /* Check all the tables.
  6951   */
  6952   for(i=0; i<nRoot && sCheck.mxErr; i++){
  6953     if( aRoot[i]==0 ) continue;
  6954 #ifndef SQLITE_OMIT_AUTOVACUUM
  6955     if( pBt->autoVacuum && aRoot[i]>1 ){
  6956       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
  6957     }
  6958 #endif
  6959     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
  6960   }
  6961 
  6962   /* Make sure every page in the file is referenced
  6963   */
  6964   for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
  6965 #ifdef SQLITE_OMIT_AUTOVACUUM
  6966     if( sCheck.anRef[i]==0 ){
  6967       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  6968     }
  6969 #else
  6970     /* If the database supports auto-vacuum, make sure no tables contain
  6971     ** references to pointer-map pages.
  6972     */
  6973     if( sCheck.anRef[i]==0 && 
  6974        (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
  6975       checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
  6976     }
  6977     if( sCheck.anRef[i]!=0 && 
  6978        (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
  6979       checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
  6980     }
  6981 #endif
  6982   }
  6983 
  6984   /* Make sure this analysis did not leave any unref() pages
  6985   */
  6986   unlockBtreeIfUnused(pBt);
  6987   if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
  6988     checkAppendMsg(&sCheck, 0, 
  6989       "Outstanding page count goes from %d to %d during this analysis",
  6990       nRef, sqlite3PagerRefcount(pBt->pPager)
  6991     );
  6992   }
  6993 
  6994   /* Clean  up and report errors.
  6995   */
  6996   sqlite3BtreeLeave(p);
  6997   sqlite3_free(sCheck.anRef);
  6998   if( sCheck.mallocFailed ){
  6999     sqlite3StrAccumReset(&sCheck.errMsg);
  7000     *pnErr = sCheck.nErr+1;
  7001     return 0;
  7002   }
  7003   *pnErr = sCheck.nErr;
  7004   if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
  7005   return sqlite3StrAccumFinish(&sCheck.errMsg);
  7006 }
  7007 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  7008 
  7009 /*
  7010 ** Return the full pathname of the underlying database file.
  7011 **
  7012 ** The pager filename is invariant as long as the pager is
  7013 ** open so it is safe to access without the BtShared mutex.
  7014 */
  7015 const char *sqlite3BtreeGetFilename(Btree *p){
  7016   assert( p->pBt->pPager!=0 );
  7017   return sqlite3PagerFilename(p->pBt->pPager);
  7018 }
  7019 
  7020 /*
  7021 ** Return the pathname of the directory that contains the database file.
  7022 **
  7023 ** The pager directory name is invariant as long as the pager is
  7024 ** open so it is safe to access without the BtShared mutex.
  7025 */
  7026 const char *sqlite3BtreeGetDirname(Btree *p){
  7027   assert( p->pBt->pPager!=0 );
  7028   return sqlite3PagerDirname(p->pBt->pPager);
  7029 }
  7030 
  7031 /*
  7032 ** Return the pathname of the journal file for this database. The return
  7033 ** value of this routine is the same regardless of whether the journal file
  7034 ** has been created or not.
  7035 **
  7036 ** The pager journal filename is invariant as long as the pager is
  7037 ** open so it is safe to access without the BtShared mutex.
  7038 */
  7039 const char *sqlite3BtreeGetJournalname(Btree *p){
  7040   assert( p->pBt->pPager!=0 );
  7041   return sqlite3PagerJournalname(p->pBt->pPager);
  7042 }
  7043 
  7044 #ifndef SQLITE_OMIT_VACUUM
  7045 /*
  7046 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
  7047 ** must be active for both files.
  7048 **
  7049 ** The size of file pTo may be reduced by this operation.
  7050 ** If anything goes wrong, the transaction on pTo is rolled back. 
  7051 **
  7052 ** If successful, CommitPhaseOne() may be called on pTo before returning. 
  7053 ** The caller should finish committing the transaction on pTo by calling
  7054 ** sqlite3BtreeCommit().
  7055 */
  7056 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
  7057   int rc = SQLITE_OK;
  7058   Pgno i;
  7059 
  7060   Pgno nFromPage;     /* Number of pages in pFrom */
  7061   Pgno nToPage;       /* Number of pages in pTo */
  7062   Pgno nNewPage;      /* Number of pages in pTo after the copy */
  7063 
  7064   Pgno iSkip;         /* Pending byte page in pTo */
  7065   int nToPageSize;    /* Page size of pTo in bytes */
  7066   int nFromPageSize;  /* Page size of pFrom in bytes */
  7067 
  7068   BtShared *pBtTo = pTo->pBt;
  7069   BtShared *pBtFrom = pFrom->pBt;
  7070   pBtTo->db = pTo->db;
  7071   pBtFrom->db = pFrom->db;
  7072 
  7073   nToPageSize = pBtTo->pageSize;
  7074   nFromPageSize = pBtFrom->pageSize;
  7075 
  7076   if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
  7077     return SQLITE_ERROR;
  7078   }
  7079   if( pBtTo->pCursor ){
  7080     return SQLITE_BUSY;
  7081   }
  7082 
  7083   nToPage = pagerPagecount(pBtTo->pPager);
  7084   nFromPage = pagerPagecount(pBtFrom->pPager);
  7085   iSkip = PENDING_BYTE_PAGE(pBtTo);
  7086 
  7087   /* Variable nNewPage is the number of pages required to store the
  7088   ** contents of pFrom using the current page-size of pTo.
  7089   */
  7090   nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) / 
  7091       (i64)nToPageSize;
  7092 
  7093   for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
  7094 
  7095     /* Journal the original page.
  7096     **
  7097     ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
  7098     ** in database *pTo (before the copy). This page is never written 
  7099     ** into the journal file. Unless i==iSkip or the page was not
  7100     ** present in pTo before the copy operation, journal page i from pTo.
  7101     */
  7102     if( i!=iSkip && i<=nToPage ){
  7103       DbPage *pDbPage = 0;
  7104       rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
  7105       if( rc==SQLITE_OK ){
  7106         rc = sqlite3PagerWrite(pDbPage);
  7107         if( rc==SQLITE_OK && i>nFromPage ){
  7108           /* Yeah.  It seems wierd to call DontWrite() right after Write(). But
  7109           ** that is because the names of those procedures do not exactly 
  7110           ** represent what they do.  Write() really means "put this page in the
  7111           ** rollback journal and mark it as dirty so that it will be written
  7112           ** to the database file later."  DontWrite() undoes the second part of
  7113           ** that and prevents the page from being written to the database. The
  7114           ** page is still on the rollback journal, though.  And that is the 
  7115           ** whole point of this block: to put pages on the rollback journal. 
  7116           */
  7117           rc = sqlite3PagerDontWrite(pDbPage);
  7118         }
  7119         sqlite3PagerUnref(pDbPage);
  7120       }
  7121     }
  7122 
  7123     /* Overwrite the data in page i of the target database */
  7124     if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
  7125 
  7126       DbPage *pToPage = 0;
  7127       sqlite3_int64 iOff;
  7128 
  7129       rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
  7130       if( rc==SQLITE_OK ){
  7131         rc = sqlite3PagerWrite(pToPage);
  7132       }
  7133 
  7134       for(
  7135         iOff=(i-1)*nToPageSize; 
  7136         rc==SQLITE_OK && iOff<i*nToPageSize; 
  7137         iOff += nFromPageSize
  7138       ){
  7139         DbPage *pFromPage = 0;
  7140         Pgno iFrom = (iOff/nFromPageSize)+1;
  7141 
  7142         if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
  7143           continue;
  7144         }
  7145 
  7146         rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
  7147         if( rc==SQLITE_OK ){
  7148           char *zTo = sqlite3PagerGetData(pToPage);
  7149           char *zFrom = sqlite3PagerGetData(pFromPage);
  7150           int nCopy;
  7151 
  7152           if( nFromPageSize>=nToPageSize ){
  7153             zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
  7154             nCopy = nToPageSize;
  7155           }else{
  7156             zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
  7157             nCopy = nFromPageSize;
  7158           }
  7159 
  7160           memcpy(zTo, zFrom, nCopy);
  7161           sqlite3PagerUnref(pFromPage);
  7162         }
  7163       }
  7164 
  7165       if( pToPage ){
  7166         MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
  7167         p->isInit = 0;
  7168         sqlite3PagerUnref(pToPage);
  7169       }
  7170     }
  7171   }
  7172 
  7173   /* If things have worked so far, the database file may need to be 
  7174   ** truncated. The complex part is that it may need to be truncated to
  7175   ** a size that is not an integer multiple of nToPageSize - the current
  7176   ** page size used by the pager associated with B-Tree pTo.
  7177   **
  7178   ** For example, say the page-size of pTo is 2048 bytes and the original 
  7179   ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024 
  7180   ** bytes and 9 pages, then the file needs to be truncated to 9KB.
  7181   */
  7182   if( rc==SQLITE_OK ){
  7183     if( nFromPageSize!=nToPageSize ){
  7184       sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
  7185       i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
  7186       i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize; 
  7187       i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
  7188   
  7189       assert( iSize<=iNow );
  7190   
  7191       /* Commit phase one syncs the journal file associated with pTo 
  7192       ** containing the original data. It does not sync the database file
  7193       ** itself. After doing this it is safe to use OsTruncate() and other
  7194       ** file APIs on the database file directly.
  7195       */
  7196       pBtTo->db = pTo->db;
  7197       rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
  7198       if( iSize<iNow && rc==SQLITE_OK ){
  7199         rc = sqlite3OsTruncate(pFile, iSize);
  7200       }
  7201   
  7202       /* The loop that copied data from database pFrom to pTo did not
  7203       ** populate the locking page of database pTo. If the page-size of
  7204       ** pFrom is smaller than that of pTo, this means some data will
  7205       ** not have been copied. 
  7206       **
  7207       ** This block copies the missing data from database pFrom to pTo 
  7208       ** using file APIs. This is safe because at this point we know that
  7209       ** all of the original data from pTo has been synced into the 
  7210       ** journal file. At this point it would be safe to do anything at
  7211       ** all to the database file except truncate it to zero bytes.
  7212       */
  7213       if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
  7214         i64 iOff;
  7215         for(
  7216           iOff=iPending; 
  7217           rc==SQLITE_OK && iOff<(iPending+nToPageSize); 
  7218           iOff += nFromPageSize
  7219         ){
  7220           DbPage *pFromPage = 0;
  7221           Pgno iFrom = (iOff/nFromPageSize)+1;
  7222   
  7223           if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
  7224             continue;
  7225           }
  7226   
  7227           rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
  7228           if( rc==SQLITE_OK ){
  7229             char *zFrom = sqlite3PagerGetData(pFromPage);
  7230             rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
  7231             sqlite3PagerUnref(pFromPage);
  7232           }
  7233         }
  7234       }
  7235   
  7236       /* Sync the database file */
  7237       if( rc==SQLITE_OK ){
  7238         rc = sqlite3PagerSync(pBtTo->pPager);
  7239       }
  7240     }else{
  7241       rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
  7242     }
  7243     if( rc==SQLITE_OK ){
  7244       pBtTo->pageSizeFixed = 0;
  7245     }
  7246   }
  7247 
  7248   if( rc ){
  7249     sqlite3BtreeRollback(pTo);
  7250   }
  7251 
  7252   return rc;  
  7253 }
  7254 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
  7255   int rc;
  7256   sqlite3BtreeEnter(pTo);
  7257   sqlite3BtreeEnter(pFrom);
  7258   rc = btreeCopyFile(pTo, pFrom);
  7259   sqlite3BtreeLeave(pFrom);
  7260   sqlite3BtreeLeave(pTo);
  7261   return rc;
  7262 }
  7263 
  7264 #endif /* SQLITE_OMIT_VACUUM */
  7265 
  7266 /*
  7267 ** Return non-zero if a transaction is active.
  7268 */
  7269 int sqlite3BtreeIsInTrans(Btree *p){
  7270   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
  7271   return (p && (p->inTrans==TRANS_WRITE));
  7272 }
  7273 
  7274 /*
  7275 ** Return non-zero if a statement transaction is active.
  7276 */
  7277 int sqlite3BtreeIsInStmt(Btree *p){
  7278   assert( sqlite3BtreeHoldsMutex(p) );
  7279   return (p->pBt && p->pBt->inStmt);
  7280 }
  7281 
  7282 /*
  7283 ** Return non-zero if a read (or write) transaction is active.
  7284 */
  7285 int sqlite3BtreeIsInReadTrans(Btree *p){
  7286   assert( sqlite3_mutex_held(p->db->mutex) );
  7287   return (p && (p->inTrans!=TRANS_NONE));
  7288 }
  7289 
  7290 /*
  7291 ** This function returns a pointer to a blob of memory associated with
  7292 ** a single shared-btree. The memory is used by client code for its own
  7293 ** purposes (for example, to store a high-level schema associated with 
  7294 ** the shared-btree). The btree layer manages reference counting issues.
  7295 **
  7296 ** The first time this is called on a shared-btree, nBytes bytes of memory
  7297 ** are allocated, zeroed, and returned to the caller. For each subsequent 
  7298 ** call the nBytes parameter is ignored and a pointer to the same blob
  7299 ** of memory returned. 
  7300 **
  7301 ** If the nBytes parameter is 0 and the blob of memory has not yet been
  7302 ** allocated, a null pointer is returned. If the blob has already been
  7303 ** allocated, it is returned as normal.
  7304 **
  7305 ** Just before the shared-btree is closed, the function passed as the 
  7306 ** xFree argument when the memory allocation was made is invoked on the 
  7307 ** blob of allocated memory. This function should not call sqlite3_free()
  7308 ** on the memory, the btree layer does that.
  7309 */
  7310 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
  7311   BtShared *pBt = p->pBt;
  7312   sqlite3BtreeEnter(p);
  7313   if( !pBt->pSchema && nBytes ){
  7314     pBt->pSchema = sqlite3MallocZero(nBytes);
  7315     pBt->xFreeSchema = xFree;
  7316   }
  7317   sqlite3BtreeLeave(p);
  7318   return pBt->pSchema;
  7319 }
  7320 
  7321 /*
  7322 ** Return true if another user of the same shared btree as the argument
  7323 ** handle holds an exclusive lock on the sqlite_master table.
  7324 */
  7325 int sqlite3BtreeSchemaLocked(Btree *p){
  7326   int rc;
  7327   assert( sqlite3_mutex_held(p->db->mutex) );
  7328   sqlite3BtreeEnter(p);
  7329   rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
  7330   sqlite3BtreeLeave(p);
  7331   return rc;
  7332 }
  7333 
  7334 
  7335 #ifndef SQLITE_OMIT_SHARED_CACHE
  7336 /*
  7337 ** Obtain a lock on the table whose root page is iTab.  The
  7338 ** lock is a write lock if isWritelock is true or a read lock
  7339 ** if it is false.
  7340 */
  7341 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
  7342   int rc = SQLITE_OK;
  7343   if( p->sharable ){
  7344     u8 lockType = READ_LOCK + isWriteLock;
  7345     assert( READ_LOCK+1==WRITE_LOCK );
  7346     assert( isWriteLock==0 || isWriteLock==1 );
  7347     sqlite3BtreeEnter(p);
  7348     rc = queryTableLock(p, iTab, lockType);
  7349     if( rc==SQLITE_OK ){
  7350       rc = lockTable(p, iTab, lockType);
  7351     }
  7352     sqlite3BtreeLeave(p);
  7353   }
  7354   return rc;
  7355 }
  7356 #endif
  7357 
  7358 #ifndef SQLITE_OMIT_INCRBLOB
  7359 /*
  7360 ** Argument pCsr must be a cursor opened for writing on an 
  7361 ** INTKEY table currently pointing at a valid table entry. 
  7362 ** This function modifies the data stored as part of that entry.
  7363 ** Only the data content may only be modified, it is not possible
  7364 ** to change the length of the data stored.
  7365 */
  7366 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
  7367   assert( cursorHoldsMutex(pCsr) );
  7368   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
  7369   assert(pCsr->isIncrblobHandle);
  7370 
  7371   restoreCursorPosition(pCsr);
  7372   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
  7373   if( pCsr->eState!=CURSOR_VALID ){
  7374     return SQLITE_ABORT;
  7375   }
  7376 
  7377   /* Check some preconditions: 
  7378   **   (a) the cursor is open for writing,
  7379   **   (b) there is no read-lock on the table being modified and
  7380   **   (c) the cursor points at a valid row of an intKey table.
  7381   */
  7382   if( !pCsr->wrFlag ){
  7383     return SQLITE_READONLY;
  7384   }
  7385   assert( !pCsr->pBt->readOnly 
  7386           && pCsr->pBt->inTransaction==TRANS_WRITE );
  7387   if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
  7388     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
  7389   }
  7390   if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
  7391     return SQLITE_ERROR;
  7392   }
  7393 
  7394   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
  7395 }
  7396 
  7397 /* 
  7398 ** Set a flag on this cursor to cache the locations of pages from the 
  7399 ** overflow list for the current row. This is used by cursors opened
  7400 ** for incremental blob IO only.
  7401 **
  7402 ** This function sets a flag only. The actual page location cache
  7403 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
  7404 ** accessPayload() (the worker function for sqlite3BtreeData() and
  7405 ** sqlite3BtreePutData()).
  7406 */
  7407 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
  7408   assert( cursorHoldsMutex(pCur) );
  7409   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
  7410   assert(!pCur->isIncrblobHandle);
  7411   assert(!pCur->aOverflow);
  7412   pCur->isIncrblobHandle = 1;
  7413 }
  7414 #endif