1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/SQLite/btree.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,7378 @@
1.4 +/*
1.5 +** 2004 April 6
1.6 +**
1.7 +** The author disclaims copyright to this source code. In place of
1.8 +** a legal notice, here is a blessing:
1.9 +**
1.10 +** May you do good and not evil.
1.11 +** May you find forgiveness for yourself and forgive others.
1.12 +** May you share freely, never taking more than you give.
1.13 +**
1.14 +*************************************************************************
1.15 +** $Id: btree.c,v 1.524 2008/09/30 17:18:17 drh Exp $
1.16 +**
1.17 +** This file implements an external (disk-based) database using BTrees.
1.18 +** See the header comment on "btreeInt.h" for additional information.
1.19 +** Including a description of file format and an overview of operation.
1.20 +*/
1.21 +#include "btreeInt.h"
1.22 +
1.23 +/*
1.24 +** The header string that appears at the beginning of every
1.25 +** SQLite database.
1.26 +*/
1.27 +static const char zMagicHeader[] = SQLITE_FILE_HEADER;
1.28 +
1.29 +/*
1.30 +** Set this global variable to 1 to enable tracing using the TRACE
1.31 +** macro.
1.32 +*/
1.33 +#if 0
1.34 +int sqlite3BtreeTrace=0; /* True to enable tracing */
1.35 +# define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
1.36 +#else
1.37 +# define TRACE(X)
1.38 +#endif
1.39 +
1.40 +/*
1.41 +** Sometimes we need a small amount of code such as a variable initialization
1.42 +** to setup for a later assert() statement. We do not want this code to
1.43 +** appear when assert() is disabled. The following macro is therefore
1.44 +** used to contain that setup code. The "VVA" acronym stands for
1.45 +** "Verification, Validation, and Accreditation". In other words, the
1.46 +** code within VVA_ONLY() will only run during verification processes.
1.47 +*/
1.48 +#ifndef NDEBUG
1.49 +# define VVA_ONLY(X) X
1.50 +#else
1.51 +# define VVA_ONLY(X)
1.52 +#endif
1.53 +
1.54 +
1.55 +
1.56 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.57 +/*
1.58 +** A list of BtShared objects that are eligible for participation
1.59 +** in shared cache. This variable has file scope during normal builds,
1.60 +** but the test harness needs to access it so we make it global for
1.61 +** test builds.
1.62 +*/
1.63 +#ifdef SQLITE_TEST
1.64 +BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
1.65 +#else
1.66 +static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
1.67 +#endif
1.68 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.69 +
1.70 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.71 +/*
1.72 +** Enable or disable the shared pager and schema features.
1.73 +**
1.74 +** This routine has no effect on existing database connections.
1.75 +** The shared cache setting affects only future calls to
1.76 +** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
1.77 +*/
1.78 +SQLITE_EXPORT int sqlite3_enable_shared_cache(int enable){
1.79 + sqlite3GlobalConfig.sharedCacheEnabled = enable; /* Stored exactly as given */
1.80 + return SQLITE_OK; /* This routine has no failure path */
1.81 +}
1.82 +#endif
1.83 +
1.84 +
1.85 +/*
1.86 +** Forward declaration
1.87 +*/
1.88 +static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
1.89 +
1.90 +
1.91 +#ifdef SQLITE_OMIT_SHARED_CACHE
1.92 + /*
1.93 + ** The functions queryTableLock(), lockTable() and unlockAllTables()
1.94 + ** manipulate entries in the BtShared.pLock linked list used to store
1.95 + ** shared-cache table level locks. If the library is compiled with the
1.96 + ** shared-cache feature disabled, then there is only ever one user
1.97 + ** of each BtShared structure and so this locking is not necessary.
1.98 + ** So define the lock related functions as no-ops.
1.99 + */
1.100 + #define queryTableLock(a,b,c) SQLITE_OK
1.101 + #define lockTable(a,b,c) SQLITE_OK
1.102 + #define unlockAllTables(a)
1.103 +#endif
1.104 +
1.105 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.106 +/*
1.107 +** Query to see if btree handle p may obtain a lock of type eLock
1.108 +** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
1.109 +** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
1.110 +** SQLITE_LOCKED if not.
1.111 +*/
1.112 +static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
1.113 + BtShared *pBt = p->pBt; /* Shared btree whose lock list is examined */
1.114 + BtLock *pIter; /* Iterator over pBt->pLock */
1.115 +
1.116 + assert( sqlite3BtreeHoldsMutex(p) );
1.117 + assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
1.118 + assert( p->db!=0 );
1.119 + 
1.120 + /* This is a no-op if the shared-cache is not enabled */
1.121 + if( !p->sharable ){
1.122 + return SQLITE_OK;
1.123 + }
1.124 +
1.125 + /* If some other connection is holding an exclusive lock, the
1.126 + ** requested lock may not be obtained.
1.127 + */
1.128 + if( pBt->pExclusive && pBt->pExclusive!=p ){
1.129 + return SQLITE_LOCKED;
1.130 + }
1.131 +
1.132 + /* This (along with lockTable()) is where the ReadUncommitted flag is
1.133 + ** dealt with. If the caller is querying for a read-lock and the flag is
1.134 + ** set, it is unconditionally granted - even if there are write-locks
1.135 + ** on the table. If a write-lock is requested, the ReadUncommitted flag
1.136 + ** is not considered.
1.137 + **
1.138 + ** In function lockTable(), if a read-lock is demanded and the 
1.139 + ** ReadUncommitted flag is set, no entry is added to the locks list 
1.140 + ** (BtShared.pLock).
1.141 + **
1.142 + ** To summarize: If the ReadUncommitted flag is set, then read cursors do
1.143 + ** not create or respect table locks. The locking procedure for a 
1.144 + ** write-cursor does not change.
1.145 + */
1.146 + if( 
1.147 + 0==(p->db->flags&SQLITE_ReadUncommitted) || 
1.148 + eLock==WRITE_LOCK ||
1.149 + iTab==MASTER_ROOT /* Locks on MASTER_ROOT are always checked */
1.150 + ){
1.151 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.152 + if( pIter->pBtree!=p && pIter->iTable==iTab && 
1.153 + (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ /* Only read+read coexist */
1.154 + return SQLITE_LOCKED;
1.155 + }
1.156 + }
1.157 + }
1.158 + return SQLITE_OK; /* No conflicting lock held by another handle */
1.159 +}
1.160 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.161 +
1.162 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.163 +/*
1.164 +** Add a lock on the table with root-page iTable to the shared-btree used
1.165 +** by Btree handle p. Parameter eLock must be either READ_LOCK or
1.166 +** WRITE_LOCK.
1.167 +**
1.168 +** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM is
1.169 +** returned if a new BtLock entry cannot be allocated.
1.170 +*/
1.171 +static int lockTable(Btree *p, Pgno iTable, u8 eLock){
1.172 + BtShared *pBt = p->pBt; /* Shared btree that owns the lock list */
1.173 + BtLock *pLock = 0; /* Entry for (p, iTable); 0 until found or created */
1.174 + BtLock *pIter; /* Iterator over pBt->pLock */
1.175 +
1.176 + assert( sqlite3BtreeHoldsMutex(p) );
1.177 + assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
1.178 + assert( p->db!=0 );
1.179 +
1.180 + /* This is a no-op if the shared-cache is not enabled */
1.181 + if( !p->sharable ){
1.182 + return SQLITE_OK;
1.183 + }
1.184 +
1.185 + assert( SQLITE_OK==queryTableLock(p, iTable, eLock) ); /* Lock must be grantable */
1.186 +
1.187 + /* If the read-uncommitted flag is set and a read-lock is requested,
1.188 + ** return early without adding an entry to the BtShared.pLock list. See
1.189 + ** comment in function queryTableLock() for more info on handling 
1.190 + ** the ReadUncommitted flag.
1.191 + */
1.192 + if( 
1.193 + (p->db->flags&SQLITE_ReadUncommitted) && 
1.194 + (eLock==READ_LOCK) &&
1.195 + iTable!=MASTER_ROOT
1.196 + ){
1.197 + return SQLITE_OK;
1.198 + }
1.199 +
1.200 + /* First search the list for an existing lock on this table. */
1.201 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.202 + if( pIter->iTable==iTable && pIter->pBtree==p ){
1.203 + pLock = pIter;
1.204 + break;
1.205 + }
1.206 + }
1.207 +
1.208 + /* If the above search did not find a BtLock struct associating Btree p
1.209 + ** with table iTable, allocate one and link it into the list.
1.210 + */
1.211 + if( !pLock ){
1.212 + pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
1.213 + if( !pLock ){
1.214 + return SQLITE_NOMEM;
1.215 + }
1.216 + pLock->iTable = iTable;
1.217 + pLock->pBtree = p;
1.218 + pLock->pNext = pBt->pLock; /* Insert at the head of the list */
1.219 + pBt->pLock = pLock;
1.220 + }
1.221 +
1.222 + /* Set the BtLock.eLock variable to the maximum of the current lock
1.223 + ** and the requested lock. This means if a write-lock was already held
1.224 + ** and a read-lock requested, we don't incorrectly downgrade the lock.
1.225 + */
1.226 + assert( WRITE_LOCK>READ_LOCK );
1.227 + if( eLock>pLock->eLock ){
1.228 + pLock->eLock = eLock;
1.229 + }
1.230 +
1.231 + return SQLITE_OK;
1.232 +}
1.233 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.234 +
1.235 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.236 +/*
1.237 +** Release all the table locks (locks obtained via calls to the lockTable()
1.238 +** procedure) held by Btree handle p.
1.239 +*/
1.240 +static void unlockAllTables(Btree *p){
1.241 + BtShared *pBt = p->pBt;
1.242 + BtLock **ppIter = &pBt->pLock; /* Address of the link to the current entry */
1.243 +
1.244 + assert( sqlite3BtreeHoldsMutex(p) );
1.245 + assert( p->sharable || 0==*ppIter );
1.246 +
1.247 + while( *ppIter ){
1.248 + BtLock *pLock = *ppIter;
1.249 + assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
1.250 + if( pLock->pBtree==p ){
1.251 + *ppIter = pLock->pNext; /* Unlink the entry before freeing it */
1.252 + sqlite3_free(pLock);
1.253 + }else{
1.254 + ppIter = &pLock->pNext; /* Entry belongs to another handle; keep it */
1.255 + }
1.256 + }
1.257 +
1.258 + if( pBt->pExclusive==p ){
1.259 + pBt->pExclusive = 0; /* p no longer holds the exclusive lock */
1.260 + }
1.261 +}
1.262 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.263 +
1.264 +static void releasePage(MemPage *pPage); /* Forward reference */
1.265 +
1.266 +/*
1.267 +** Verify that the cursor holds a mutex on the BtShared
1.268 +*/
1.269 +#ifndef NDEBUG
1.270 +static int cursorHoldsMutex(BtCursor *p){
1.271 + return sqlite3_mutex_held(p->pBt->mutex); /* Compiled only when NDEBUG is not defined */
1.272 +}
1.273 +#endif
1.274 +
1.275 +
1.276 +#ifndef SQLITE_OMIT_INCRBLOB
1.277 +/*
1.278 +** Invalidate the overflow page-list cache for cursor pCur, if any.
1.279 +*/
1.280 +static void invalidateOverflowCache(BtCursor *pCur){
1.281 + assert( cursorHoldsMutex(pCur) );
1.282 + sqlite3_free(pCur->aOverflow); /* sqlite3_free() is a no-op on a NULL pointer */
1.283 + pCur->aOverflow = 0; /* Clear the pointer so the freed array is not reused */
1.284 +}
1.285 +
1.286 +/*
1.287 +** Invalidate the overflow page-list cache for all cursors opened
1.288 +** on the shared btree structure pBt.
1.289 +*/
1.290 +static void invalidateAllOverflowCache(BtShared *pBt){
1.291 + BtCursor *p; /* Iterator over the pBt->pCursor list */
1.292 + assert( sqlite3_mutex_held(pBt->mutex) );
1.293 + for(p=pBt->pCursor; p; p=p->pNext){
1.294 + invalidateOverflowCache(p);
1.295 + }
1.296 +}
1.297 +#else
1.298 + #define invalidateOverflowCache(x)
1.299 + #define invalidateAllOverflowCache(x)
1.300 +#endif
1.301 +
1.302 +/*
1.303 +** Save the current cursor position in the variables BtCursor.nKey
1.304 +** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
1.305 +*/
1.306 +static int saveCursorPosition(BtCursor *pCur){
1.307 + int rc; /* Result code returned to the caller */
1.308 +
1.309 + assert( CURSOR_VALID==pCur->eState );
1.310 + assert( 0==pCur->pKey );
1.311 + assert( cursorHoldsMutex(pCur) );
1.312 +
1.313 + rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
1.314 +
1.315 + /* If this is an intKey table, then the above call to BtreeKeySize()
1.316 + ** stores the integer key in pCur->nKey. In this case this value is
1.317 + ** all that is required. Otherwise, if pCur is not open on an intKey
1.318 + ** table, then malloc space for and store the pCur->nKey bytes of key 
1.319 + ** data.
1.320 + */
1.321 + if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
1.322 + void *pKey = sqlite3Malloc(pCur->nKey); /* NOTE(review): confirm nKey always fits the allocator's size argument */
1.323 + if( pKey ){
1.324 + rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
1.325 + if( rc==SQLITE_OK ){
1.326 + pCur->pKey = pKey;
1.327 + }else{
1.328 + sqlite3_free(pKey); /* Key read failed: discard the buffer */
1.329 + }
1.330 + }else{
1.331 + rc = SQLITE_NOMEM;
1.332 + }
1.333 + }
1.334 + assert( !pCur->apPage[0]->intKey || !pCur->pKey );
1.335 +
1.336 + if( rc==SQLITE_OK ){
1.337 + int i;
1.338 + for(i=0; i<=pCur->iPage; i++){
1.339 + releasePage(pCur->apPage[i]); /* Release every page on the cursor's stack */
1.340 + pCur->apPage[i] = 0;
1.341 + }
1.342 + pCur->iPage = -1;
1.343 + pCur->eState = CURSOR_REQUIRESEEK;
1.344 + }
1.345 +
1.346 + invalidateOverflowCache(pCur); /* Done regardless of the value of rc */
1.347 + return rc;
1.348 +}
1.349 +
1.350 +/*
1.351 +** Save the positions of all cursors except pExcept open on the table
1.352 +** with root-page iRoot. Usually, this is called just before cursor
1.353 +** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
1.354 +*/
1.355 +static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
1.356 + BtCursor *p; /* Iterator over the pBt->pCursor list */
1.357 + assert( sqlite3_mutex_held(pBt->mutex) );
1.358 + assert( pExcept==0 || pExcept->pBt==pBt );
1.359 + for(p=pBt->pCursor; p; p=p->pNext){
1.360 + if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) && 
1.361 + p->eState==CURSOR_VALID ){ /* iRoot==0 matches cursors on every table */
1.362 + int rc = saveCursorPosition(p);
1.363 + if( SQLITE_OK!=rc ){
1.364 + return rc; /* Stop at the first failure; later cursors are not saved */
1.365 + }
1.366 + }
1.367 + }
1.368 + return SQLITE_OK;
1.369 +}
1.370 +
1.371 +/*
1.372 +** Clear the current cursor position.
1.373 +*/
1.374 +static void clearCursorPosition(BtCursor *pCur){
1.375 + assert( cursorHoldsMutex(pCur) );
1.376 + sqlite3_free(pCur->pKey); /* Release any key saved by saveCursorPosition() */
1.377 + pCur->pKey = 0;
1.378 + pCur->eState = CURSOR_INVALID;
1.379 +}
1.380 +
1.381 +/*
1.382 +** Restore the cursor to the position it was in (or as close to as possible)
1.383 +** when saveCursorPosition() was called. Note that this call deletes the
1.384 +** saved position info stored by saveCursorPosition(), so there can be
1.385 +** at most one effective restoreCursorPosition() call after each
1.386 +** saveCursorPosition().
1.387 +*/
1.388 +int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
1.389 + int rc; /* Result of the sqlite3BtreeMoveto() call */
1.390 + assert( cursorHoldsMutex(pCur) );
1.391 + assert( pCur->eState>=CURSOR_REQUIRESEEK );
1.392 + if( pCur->eState==CURSOR_FAULT ){
1.393 + return pCur->skip; /* In the FAULT state pCur->skip is returned as the result code */
1.394 + }
1.395 + pCur->eState = CURSOR_INVALID;
1.396 + rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
1.397 + if( rc==SQLITE_OK ){
1.398 + sqlite3_free(pCur->pKey); /* Saved key is discarded only on success */
1.399 + pCur->pKey = 0;
1.400 + assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
1.401 + }
1.402 + return rc;
1.403 +}
1.404 +
1.405 +#define restoreCursorPosition(p) \
1.406 + (p->eState>=CURSOR_REQUIRESEEK ? \
1.407 + sqlite3BtreeRestoreCursorPosition(p) : \
1.408 + SQLITE_OK) /* Yields SQLITE_OK without a call when eState<CURSOR_REQUIRESEEK; p is not parenthesized, so pass a plain identifier */
1.409 +
1.410 +/*
1.411 +** Determine whether or not a cursor has moved from the position it
1.412 +** was last placed at. Cursors can move when the row they are pointing
1.413 +** at is deleted out from under them.
1.414 +**
1.415 +** This routine returns an error code if something goes wrong. The
1.416 +** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
1.417 +*/
1.418 +int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
1.419 + int rc; /* Result of restoring the cursor position */
1.420 +
1.421 + rc = restoreCursorPosition(pCur);
1.422 + if( rc ){
1.423 + *pHasMoved = 1; /* On error the cursor is reported as moved */
1.424 + return rc;
1.425 + }
1.426 + if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){ /* Any non-VALID state or nonzero skip counts as moved */
1.427 + *pHasMoved = 1;
1.428 + }else{
1.429 + *pHasMoved = 0;
1.430 + }
1.431 + return SQLITE_OK;
1.432 +}
1.433 +
1.434 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.435 +/*
1.436 +** Given a page number of a regular database page, return the page
1.437 +** number for the pointer-map page that contains the entry for the
1.438 +** input page number.
1.439 +*/
1.440 +static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
1.441 + int nPagesPerMapPage, iPtrMap, ret;
1.442 + assert( sqlite3_mutex_held(pBt->mutex) );
1.443 + nPagesPerMapPage = (pBt->usableSize/5)+1; /* Entries are 5 bytes each (see ptrmapPut) */
1.444 + iPtrMap = (pgno-2)/nPagesPerMapPage; /* Zero-based index of the map page group */
1.445 + ret = (iPtrMap*nPagesPerMapPage) + 2; /* Groups start at page 2 */
1.446 + if( ret==PENDING_BYTE_PAGE(pBt) ){
1.447 + ret++; /* Never return the pending-byte page itself */
1.448 + }
1.449 + return ret;
1.450 +}
1.451 +
1.452 +/*
1.453 +** Write an entry into the pointer map.
1.454 +**
1.455 +** This routine updates the pointer map entry for page number 'key'
1.456 +** so that it maps to type 'eType' and parent page number 'parent'.
1.457 +** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1.458 +*/
1.459 +static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
1.460 + DbPage *pDbPage; /* The pointer map page */
1.461 + u8 *pPtrmap; /* The pointer map data */
1.462 + Pgno iPtrmap; /* The pointer map page number */
1.463 + int offset; /* Offset in pointer map page */
1.464 + int rc; /* Result code returned to the caller */
1.465 +
1.466 + assert( sqlite3_mutex_held(pBt->mutex) );
1.467 + /* The master-journal page number must never be used as a pointer map page */
1.468 + assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
1.469 +
1.470 + assert( pBt->autoVacuum );
1.471 + if( key==0 ){
1.472 + return SQLITE_CORRUPT_BKPT; /* Page number 0 is rejected as corruption */
1.473 + }
1.474 + iPtrmap = PTRMAP_PAGENO(pBt, key);
1.475 + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
1.476 + if( rc!=SQLITE_OK ){
1.477 + return rc;
1.478 + }
1.479 + offset = PTRMAP_PTROFFSET(iPtrmap, key);
1.480 + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1.481 +
1.482 + if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ /* Write only if the entry changes */
1.483 + TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1.484 + rc = sqlite3PagerWrite(pDbPage);
1.485 + if( rc==SQLITE_OK ){
1.486 + pPtrmap[offset] = eType; /* Byte 0 of the entry: type */
1.487 + put4byte(&pPtrmap[offset+1], parent); /* Bytes 1..4: parent page number */
1.488 + }
1.489 + }
1.490 +
1.491 + sqlite3PagerUnref(pDbPage); /* Released on both the success and failure paths */
1.492 + return rc;
1.493 +}
1.494 +
1.495 +/*
1.496 +** Read an entry from the pointer map.
1.497 +**
1.498 +** This routine retrieves the pointer map entry for page 'key', writing
1.499 +** the type and parent page number to *pEType and *pPgno respectively.
1.500 +** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1.501 +*/
1.502 +static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1.503 + DbPage *pDbPage; /* The pointer map page */
1.504 + int iPtrmap; /* Pointer map page index */
1.505 + u8 *pPtrmap; /* Pointer map page data */
1.506 + int offset; /* Offset of entry in pointer map */
1.507 + int rc; /* Result of fetching the pointer map page */
1.508 +
1.509 + assert( sqlite3_mutex_held(pBt->mutex) );
1.510 +
1.511 + iPtrmap = PTRMAP_PAGENO(pBt, key);
1.512 + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
1.513 + if( rc!=0 ){
1.514 + return rc;
1.515 + }
1.516 + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1.517 +
1.518 + offset = PTRMAP_PTROFFSET(iPtrmap, key);
1.519 + assert( pEType!=0 );
1.520 + *pEType = pPtrmap[offset];
1.521 + if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]); /* pPgno may be NULL; pEType may not */
1.522 +
1.523 + sqlite3PagerUnref(pDbPage);
1.524 + if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; /* Outputs are already written when this fires */
1.525 + return SQLITE_OK;
1.526 +}
1.527 +
1.528 +#else /* if defined SQLITE_OMIT_AUTOVACUUM */
1.529 + #define ptrmapPut(w,x,y,z) SQLITE_OK
1.530 + #define ptrmapGet(w,x,y,z) SQLITE_OK
1.531 + #define ptrmapPutOvfl(y,z) SQLITE_OK
1.532 +#endif
1.533 +
1.534 +/*
1.535 +** Given a btree page and a cell index (0 means the first cell on
1.536 +** the page, 1 means the second cell, and so forth) return a pointer
1.537 +** to the cell content.
1.538 +**
1.539 +** This routine works only for pages that do not contain overflow cells.
1.540 +*/
1.541 +#define findCell(P,I) \
1.542 + ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)]))) /* Reads the I-th 2-byte cell pointer and masks it with maskPage; P is evaluated more than once */
1.543 +
1.544 +/*
1.545 +** This is a more complex version of findCell() that works for
1.546 +** pages that do contain overflow cells. See insert
1.547 +*/
1.548 +static u8 *findOverflowCell(MemPage *pPage, int iCell){
1.549 + int i; /* Index into pPage->aOvfl[] */
1.550 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.551 + for(i=pPage->nOverflow-1; i>=0; i--){ /* Scan overflow cells from last to first */
1.552 + int k;
1.553 + struct _OvflCell *pOvfl;
1.554 + pOvfl = &pPage->aOvfl[i];
1.555 + k = pOvfl->idx;
1.556 + if( k<=iCell ){
1.557 + if( k==iCell ){
1.558 + return pOvfl->pCell; /* Requested cell is itself an overflow cell */
1.559 + }
1.560 + iCell--; /* An overflow cell sits before iCell: shift the on-page index down */
1.561 + }
1.562 + }
1.563 + return findCell(pPage, iCell); /* Look up the adjusted index in the on-page pointer array */
1.564 +}
1.565 +
1.566 +/*
1.567 +** Parse a cell content block and fill in the CellInfo structure. There
1.568 +** are two versions of this function. sqlite3BtreeParseCell() takes a
1.569 +** cell index as the second argument and sqlite3BtreeParseCellPtr()
1.570 +** takes a pointer to the body of the cell as its second argument.
1.571 +**
1.572 +** Within this file, the parseCell() macro can be called instead of
1.573 +** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
1.574 +*/
1.575 +void sqlite3BtreeParseCellPtr(
1.576 + MemPage *pPage, /* Page containing the cell */
1.577 + u8 *pCell, /* Pointer to the cell text. */
1.578 + CellInfo *pInfo /* Fill in this structure */
1.579 +){
1.580 + int n; /* Number of bytes in cell content header */
1.581 + u32 nPayload; /* Number of bytes of cell payload */
1.582 +
1.583 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.584 +
1.585 + pInfo->pCell = pCell;
1.586 + assert( pPage->leaf==0 || pPage->leaf==1 );
1.587 + n = pPage->childPtrSize; /* Header starts after the child pointer (4 or 0 bytes) */
1.588 + assert( n==4-4*pPage->leaf );
1.589 + if( pPage->intKey ){
1.590 + if( pPage->hasData ){
1.591 + n += getVarint32(&pCell[n], nPayload);
1.592 + }else{
1.593 + nPayload = 0;
1.594 + }
1.595 + n += getVarint(&pCell[n], (u64*)&pInfo->nKey); /* The integer key itself is stored as a varint */
1.596 + pInfo->nData = nPayload;
1.597 + }else{
1.598 + pInfo->nData = 0;
1.599 + n += getVarint32(&pCell[n], nPayload);
1.600 + pInfo->nKey = nPayload; /* For non-intKey pages the payload is the key */
1.601 + }
1.602 + pInfo->nPayload = nPayload;
1.603 + pInfo->nHeader = n;
1.604 + if( likely(nPayload<=pPage->maxLocal) ){
1.605 + /* This is the (easy) common case where the entire payload fits
1.606 + ** on the local page. No overflow is required.
1.607 + */
1.608 + int nSize; /* Total size of cell content in bytes */
1.609 + nSize = nPayload + n;
1.610 + pInfo->nLocal = nPayload;
1.611 + pInfo->iOverflow = 0;
1.612 + if( (nSize & ~3)==0 ){ /* True exactly when nSize is less than 4 */
1.613 + nSize = 4; /* Minimum cell size is 4 */
1.614 + }
1.615 + pInfo->nSize = nSize;
1.616 + }else{
1.617 + /* If the payload will not fit completely on the local page, we have
1.618 + ** to decide how much to store locally and how much to spill onto
1.619 + ** overflow pages. The strategy is to minimize the amount of unused
1.620 + ** space on overflow pages while keeping the amount of local storage
1.621 + ** in between minLocal and maxLocal.
1.622 + **
1.623 + ** Warning: changing the way overflow payload is distributed in any
1.624 + ** way will result in an incompatible file format.
1.625 + */
1.626 + int minLocal; /* Minimum amount of payload held locally */
1.627 + int maxLocal; /* Maximum amount of payload held locally */
1.628 + int surplus; /* Overflow payload available for local storage */
1.629 +
1.630 + minLocal = pPage->minLocal;
1.631 + maxLocal = pPage->maxLocal;
1.632 + surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
1.633 + if( surplus <= maxLocal ){
1.634 + pInfo->nLocal = surplus;
1.635 + }else{
1.636 + pInfo->nLocal = minLocal;
1.637 + }
1.638 + pInfo->iOverflow = pInfo->nLocal + n; /* Offset of the overflow page number within the cell */
1.639 + pInfo->nSize = pInfo->iOverflow + 4; /* +4 for the 4-byte overflow page number */
1.640 + }
1.641 +}
1.642 +#define parseCell(pPage, iCell, pInfo) \
1.643 + sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
1.644 +void sqlite3BtreeParseCell(
1.645 + MemPage *pPage, /* Page containing the cell */
1.646 + int iCell, /* The cell index. First cell is 0 */
1.647 + CellInfo *pInfo /* Fill in this structure */
1.648 +){
1.649 + parseCell(pPage, iCell, pInfo); /* Uses findCell(), so overflow cells are not consulted */
1.650 +}
1.651 +
1.652 +/*
1.653 +** Compute the total number of bytes that a Cell needs in the cell
1.654 +** data area of the btree-page. The return number includes the cell
1.655 +** data header and the local payload, but not any overflow page or
1.656 +** the space used by the cell pointer.
1.657 +*/
1.658 +#ifndef NDEBUG
1.659 +static u16 cellSize(MemPage *pPage, int iCell){
1.660 + CellInfo info; /* Filled in by sqlite3BtreeParseCell() */
1.661 + sqlite3BtreeParseCell(pPage, iCell, &info);
1.662 + return info.nSize; /* Compiled only when NDEBUG is not defined */
1.663 +}
1.664 +#endif
1.665 +static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1.666 + CellInfo info; /* Filled in by sqlite3BtreeParseCellPtr() */
1.667 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.668 + return info.nSize; /* Same result as cellSize(), but takes a cell pointer */
1.669 +}
1.670 +
1.671 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.672 +/*
1.673 +** If the cell pCell, part of page pPage contains a pointer
1.674 +** to an overflow page, insert an entry into the pointer-map
1.675 +** for the overflow page.
1.676 +*/
1.677 +static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
1.678 + CellInfo info; /* Parsed form of pCell */
1.679 + assert( pCell!=0 );
1.680 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.681 + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1.682 + if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ /* Payload exceeds what is stored locally */
1.683 + Pgno ovfl = get4byte(&pCell[info.iOverflow]); /* First overflow page number */
1.684 + return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
1.685 + }
1.686 + return SQLITE_OK; /* No overflow page: nothing to record */
1.687 +}
1.688 +/*
1.689 +** If the cell with index iCell on page pPage contains a pointer
1.690 +** to an overflow page, insert an entry into the pointer-map
1.691 +** for the overflow page.
1.692 +*/
1.693 +static int ptrmapPutOvfl(MemPage *pPage, int iCell){
1.694 + u8 *pCell; /* Cell iCell, located via findOverflowCell() */
1.695 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.696 + pCell = findOverflowCell(pPage, iCell); /* Works for pages that have overflow cells */
1.697 + return ptrmapPutOvflPtr(pPage, pCell);
1.698 +}
1.699 +#endif
1.700 +
1.701 +
1.702 +/*
1.703 +** Defragment the page given. All Cells are moved to the
1.704 +** end of the page and all free space is collected into one
1.705 +** big FreeBlk that occurs in between the header and cell
1.706 +** pointer array and the cell content area.
1.707 +*/
1.708 +static void defragmentPage(MemPage *pPage){
1.709 + int i; /* Loop counter */
1.710 + int pc; /* Address of the i-th cell */
1.711 + int addr; /* Offset of first byte after cell pointer array */
1.712 + int hdr; /* Offset to the page header */
1.713 + int size; /* Size of a cell */
1.714 + int usableSize; /* Number of usable bytes on a page */
1.715 + int cellOffset; /* Offset to the cell pointer array */
1.716 + int cbrk; /* Offset to the cell content area */
1.717 + int nCell; /* Number of cells on the page */
1.718 + unsigned char *data; /* The page data */
1.719 + unsigned char *temp; /* Temp area for cell content */
1.720 +
1.721 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.722 + assert( pPage->pBt!=0 );
1.723 + assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1.724 + assert( pPage->nOverflow==0 );
1.725 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.726 + temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1.727 + data = pPage->aData;
1.728 + hdr = pPage->hdrOffset;
1.729 + cellOffset = pPage->cellOffset;
1.730 + nCell = pPage->nCell;
1.731 + assert( nCell==get2byte(&data[hdr+3]) );
1.732 + usableSize = pPage->pBt->usableSize;
1.733 + cbrk = get2byte(&data[hdr+5]);
1.734 + memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); /* Snapshot the content area at the same offsets */
1.735 + cbrk = usableSize; /* Repack cells downward from the end of the usable area */
1.736 + for(i=0; i<nCell; i++){
1.737 + u8 *pAddr; /* The i-th cell pointer */
1.738 + pAddr = &data[cellOffset + i*2];
1.739 + pc = get2byte(pAddr);
1.740 + assert( pc<pPage->pBt->usableSize );
1.741 + size = cellSizePtr(pPage, &temp[pc]); /* Parse from the snapshot, not the page being rewritten */
1.742 + cbrk -= size;
1.743 + memcpy(&data[cbrk], &temp[pc], size);
1.744 + put2byte(pAddr, cbrk); /* Point the cell pointer at the new location */
1.745 + }
1.746 + assert( cbrk>=cellOffset+2*nCell );
1.747 + put2byte(&data[hdr+5], cbrk); /* New start of the cell content area */
1.748 + data[hdr+1] = 0; /* Bytes hdr+1..hdr+2: first freeblock offset, now none */
1.749 + data[hdr+2] = 0;
1.750 + data[hdr+7] = 0; /* Fragmented-byte count, now zero */
1.751 + addr = cellOffset+2*nCell;
1.752 + memset(&data[addr], 0, cbrk-addr); /* Zero the gap between pointer array and content */
1.753 +}
1.754 +
1.755 +/*
1.756 +** Allocate nByte bytes of space on a page.
1.757 +**
1.758 +** Return the index into pPage->aData[] of the first byte of
1.759 +** the new allocation. The caller guarantees that there is enough
1.760 +** space. This routine will never fail.
1.761 +**
1.762 +** If the page contains nByte bytes of free space but does not contain
1.763 +** nByte bytes of contiguous free space, then this routine automatically
1.764 +** calls defragmentPage() to consolidate all free space before 
1.765 +** allocating the new chunk.
1.766 +*/
1.767 +static int allocateSpace(MemPage *pPage, int nByte){
1.768 + int addr, pc, hdr;
1.769 + int size; /* Size of the freeblock being examined */
1.770 + int nFrag; /* Fragmented-byte count read from data[hdr+7] */
1.771 + int top; /* Start of the cell content area */
1.772 + int nCell; /* Cell count read from the page header */
1.773 + int cellOffset; /* Offset to the cell pointer array */
1.774 + unsigned char *data; /* The page data */
1.775 + 
1.776 + data = pPage->aData;
1.777 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.778 + assert( pPage->pBt );
1.779 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.780 + assert( nByte>=0 ); /* Minimum cell size is 4 */
1.781 + assert( pPage->nFree>=nByte );
1.782 + assert( pPage->nOverflow==0 );
1.783 + pPage->nFree -= nByte; /* Accounted up front; every path below allocates */
1.784 + hdr = pPage->hdrOffset;
1.785 +
1.786 + nFrag = data[hdr+7];
1.787 + if( nFrag<60 ){ /* At 60 or more fragment bytes, skip the freelist and defragment below */
1.788 + /* Search the freelist looking for a slot big enough to satisfy the
1.789 + ** space request. */
1.790 + addr = hdr+1;
1.791 + while( (pc = get2byte(&data[addr]))>0 ){
1.792 + size = get2byte(&data[pc+2]);
1.793 + if( size>=nByte ){
1.794 + if( size<nByte+4 ){ /* Remainder is under 4 bytes, too small for a freeblock */
1.795 + memcpy(&data[addr], &data[pc], 2); /* Unlink the slot from the freelist */
1.796 + data[hdr+7] = nFrag + size - nByte; /* Leftover bytes are counted as fragments */
1.797 + return pc;
1.798 + }else{
1.799 + put2byte(&data[pc+2], size-nByte); /* Shrink the slot in place */
1.800 + return pc + size - nByte; /* Allocate from the tail of the slot */
1.801 + }
1.802 + }
1.803 + addr = pc;
1.804 + }
1.805 + }
1.806 +
1.807 + /* Allocate memory from the gap in between the cell pointer array
1.808 + ** and the cell content area.
1.809 + */
1.810 + top = get2byte(&data[hdr+5]);
1.811 + nCell = get2byte(&data[hdr+3]);
1.812 + cellOffset = pPage->cellOffset;
1.813 + if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){ /* Too fragmented, or the gap is too small */
1.814 + defragmentPage(pPage);
1.815 + top = get2byte(&data[hdr+5]); /* Re-read: defragmentPage() rewrites this field */
1.816 + }
1.817 + top -= nByte;
1.818 + assert( cellOffset + 2*nCell <= top );
1.819 + put2byte(&data[hdr+5], top);
1.820 + return top;
1.821 +}
1.822 +
1.823 +/*
1.824 +** Return a section of the pPage->aData to the freelist.
1.825 +** The first byte of the new free block is pPage->aData[start]
1.826 +** and the size of the block is "size" bytes.
1.827 +**
1.828 +** Most of the effort here is involved in coalescing adjacent
1.829 +** free blocks into a single big free block.
1.830 +*/
1.831 +static void freeSpace(MemPage *pPage, int start, int size){
1.832 + int addr, pbegin, hdr;
1.833 + unsigned char *data = pPage->aData; /* The page data */
1.834 +
1.835 + assert( pPage->pBt!=0 );
1.836 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.837 + assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
1.838 + assert( (start + size)<=pPage->pBt->usableSize );
1.839 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.840 + assert( size>=0 ); /* Minimum cell size is 4 */
1.841 +
1.842 +#ifdef SQLITE_SECURE_DELETE
1.843 + /* Overwrite deleted information with zeros when the SECURE_DELETE 
1.844 + ** option is enabled at compile-time */
1.845 + memset(&data[start], 0, size);
1.846 +#endif
1.847 +
1.848 + /* Add the space back into the linked list of freeblocks */
1.849 + hdr = pPage->hdrOffset;
1.850 + addr = hdr + 1;
1.851 + while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){ /* Find the insertion point by ascending offset */
1.852 + assert( pbegin<=pPage->pBt->usableSize-4 );
1.853 + assert( pbegin>addr );
1.854 + addr = pbegin;
1.855 + }
1.856 + assert( pbegin<=pPage->pBt->usableSize-4 );
1.857 + assert( pbegin>addr || pbegin==0 );
1.858 + put2byte(&data[addr], start); /* Predecessor now links to the new block */
1.859 + put2byte(&data[start], pbegin); /* New block: bytes 0..1 hold the next-block offset */
1.860 + put2byte(&data[start+2], size); /* New block: bytes 2..3 hold its size */
1.861 + pPage->nFree += size;
1.862 +
1.863 + /* Coalesce adjacent free blocks */
1.864 + addr = pPage->hdrOffset + 1;
1.865 + while( (pbegin = get2byte(&data[addr]))>0 ){
1.866 + int pnext, psize;
1.867 + assert( pbegin>addr );
1.868 + assert( pbegin<=pPage->pBt->usableSize-4 );
1.869 + pnext = get2byte(&data[pbegin]);
1.870 + psize = get2byte(&data[pbegin+2]);
1.871 + if( pbegin + psize + 3 >= pnext && pnext>0 ){ /* Next block is at most 3 bytes past the end of this one */
1.872 + int frag = pnext - (pbegin+psize); /* Bytes between the two blocks */
1.873 + assert( frag<=data[pPage->hdrOffset+7] );
1.874 + data[pPage->hdrOffset+7] -= frag; /* Those bytes stop being counted as fragments */
1.875 + put2byte(&data[pbegin], get2byte(&data[pnext]));
1.876 + put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
1.877 + }else{
1.878 + addr = pbegin; /* Advance only when no merge happened, so the merged block is rechecked */
1.879 + }
1.880 + }
1.881 +
1.882 + /* If the cell content area begins with a freeblock, remove it. */
1.883 + if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){ /* Byte-wise compare of two 2-byte offsets */
1.884 + int top;
1.885 + pbegin = get2byte(&data[hdr+1]);
1.886 + memcpy(&data[hdr+1], &data[pbegin], 2); /* Freelist head skips the removed block */
1.887 + top = get2byte(&data[hdr+5]);
1.888 + put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); /* Content area now starts after the block */
1.889 + }
1.890 +}
1.891 +
1.892 +/*
1.893 +** Decode the flags byte (the first byte of the header) for a page
1.894 +** and initialize fields of the MemPage structure accordingly.
1.895 +**
1.896 +** Only the following combinations are supported. Anything different
1.897 +** indicates a corrupt database file and returns SQLITE_CORRUPT_BKPT:
1.898 +**
1.899 +** PTF_ZERODATA
1.900 +** PTF_ZERODATA | PTF_LEAF
1.901 +** PTF_LEAFDATA | PTF_INTKEY
1.902 +** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1.903 +*/
1.904 +static int decodeFlags(MemPage *pPage, int flagByte){
1.905 + BtShared *pBt; /* A copy of pPage->pBt */
1.906 +
1.907 + assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1.908 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.909 + pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
1.910 + flagByte &= ~PTF_LEAF; /* Leaf bit is recorded above; drop it before matching */
1.911 + pPage->childPtrSize = 4-4*pPage->leaf; /* 0 when leaf==1, 4 when leaf==0 */
1.912 + pBt = pPage->pBt;
1.913 + if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1.914 + pPage->intKey = 1;
1.915 + pPage->hasData = pPage->leaf; /* Only leaves of this page type carry data */
1.916 + pPage->maxLocal = pBt->maxLeaf;
1.917 + pPage->minLocal = pBt->minLeaf;
1.918 + }else if( flagByte==PTF_ZERODATA ){
1.919 + pPage->intKey = 0;
1.920 + pPage->hasData = 0;
1.921 + pPage->maxLocal = pBt->maxLocal;
1.922 + pPage->minLocal = pBt->minLocal;
1.923 + }else{
1.924 + return SQLITE_CORRUPT_BKPT; /* Unsupported flag combination */
1.925 + }
1.926 + return SQLITE_OK;
1.927 +}
1.928 +
1.929 +/*
1.930 +** Initialize the auxiliary information for a disk block.
1.931 +** This is a no-op if pPage->isInit is already set.
1.932 +** Return SQLITE_OK on success. If we see that the page does
1.933 +** not contain a well-formed database page, then return
1.934 +** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1.935 +** guarantee that the page is well-formed. It only shows that
1.936 +** we failed to detect any corruption.
1.937 +*/
1.938 +int sqlite3BtreeInitPage(MemPage *pPage){
1.939 +
1.940 + assert( pPage->pBt!=0 );
1.941 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.942 + assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1.943 + assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1.944 + assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1.945 +
1.946 + if( !pPage->isInit ){
1.947 + int pc; /* Address of a freeblock within pPage->aData[] */
1.948 + int hdr; /* Offset to beginning of page header */
1.949 + u8 *data; /* Equal to pPage->aData */
1.950 + BtShared *pBt; /* The main btree structure */
1.951 + int usableSize; /* Amount of usable space on each page */
1.952 + int cellOffset; /* Offset from start of page to first cell pointer */
1.953 + int nFree; /* Number of unused bytes on the page */
1.954 + int top; /* First byte of the cell content area */
1.955 +
1.956 + pBt = pPage->pBt;
1.957 +
1.958 + hdr = pPage->hdrOffset;
1.959 + data = pPage->aData;
1.960 + if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
1.961 + assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1.962 + pPage->maskPage = pBt->pageSize - 1;
1.963 + pPage->nOverflow = 0;
1.964 + usableSize = pBt->usableSize;
1.965 + pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
1.966 + top = get2byte(&data[hdr+5]);
1.967 + pPage->nCell = get2byte(&data[hdr+3]);
1.968 + if( pPage->nCell>MX_CELL(pBt) ){
1.969 + /* Too many cells for a single page. The page must be corrupt */
1.970 + return SQLITE_CORRUPT_BKPT;
1.971 + }
1.972 +
1.973 + /* Compute the total free space on the page */
1.974 + pc = get2byte(&data[hdr+1]); /* Offset of the first freeblock, 0 if none */
1.975 + nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); /* header byte 7 + gap between cell pointers and content */
1.976 + while( pc>0 ){
1.977 + int next, size;
1.978 + if( pc>usableSize-4 ){
1.979 + /* Free block is off the page */
1.980 + return SQLITE_CORRUPT_BKPT;
1.981 + }
1.982 + next = get2byte(&data[pc]);
1.983 + size = get2byte(&data[pc+2]);
1.984 + if( next>0 && next<=pc+size+3 ){
1.985 + /* Free blocks must be in ascending order, at least 4 bytes apart */
1.986 + return SQLITE_CORRUPT_BKPT;
1.987 + }
1.988 + nFree += size;
1.989 + pc = next;
1.990 + }
1.991 + pPage->nFree = nFree;
1.992 + if( nFree>=usableSize ){
1.993 + /* Free space cannot exceed total page size */
1.994 + return SQLITE_CORRUPT_BKPT;
1.995 + }
1.996 +
1.997 +#if 0
1.998 + /* Check that all the offsets in the cell offset array are within range.
1.999 + **
1.1000 + ** Omitting this consistency check and using the pPage->maskPage mask
1.1001 + ** to prevent overrunning the page buffer in findCell() results in a
1.1002 + ** 2.5% performance gain.
1.1003 + */
1.1004 + {
1.1005 + u8 *pOff; /* Iterator used to check all cell offsets are in range */
1.1006 + u8 *pEnd; /* Pointer to end of cell offset array */
1.1007 + u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1.1008 + mask = ~(((u8)(pBt->pageSize>>8))-1);
1.1009 + pEnd = &data[cellOffset + pPage->nCell*2];
1.1010 + for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1.1011 + if( pOff!=pEnd ){
1.1012 + return SQLITE_CORRUPT_BKPT;
1.1013 + }
1.1014 + }
1.1015 +#endif
1.1016 +
1.1017 + pPage->isInit = 1;
1.1018 + }
1.1019 + return SQLITE_OK;
1.1020 +}
1.1021 +
1.1022 +/*
1.1023 +** Set up a raw page so that it looks like a database page holding
1.1024 +** no entries. "flags" becomes the page's flags byte (data[hdr]).
1.1025 +*/
1.1026 +static void zeroPage(MemPage *pPage, int flags){
1.1027 + unsigned char *data = pPage->aData;
1.1028 + BtShared *pBt = pPage->pBt;
1.1029 + int hdr = pPage->hdrOffset;
1.1030 + int first; /* Offset of the first byte after the page header */
1.1031 +
1.1032 + assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1.1033 + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1.1034 + assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1.1035 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.1036 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1037 + /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1.1038 + data[hdr] = flags;
1.1039 + first = hdr + 8 + 4*((flags&PTF_LEAF)==0); /* 8-byte header with PTF_LEAF, 12 without */
1.1040 + memset(&data[hdr+1], 0, 4); /* Zero header bytes 1..4: first freeblock offset and cell count */
1.1041 + data[hdr+7] = 0;
1.1042 + put2byte(&data[hdr+5], pBt->usableSize); /* Cell content area starts at usableSize, i.e. it is empty */
1.1043 + pPage->nFree = pBt->usableSize - first;
1.1044 + decodeFlags(pPage, flags); /* NOTE(review): return code ignored - presumably flags is always valid here */
1.1045 + pPage->hdrOffset = hdr;
1.1046 + pPage->cellOffset = first;
1.1047 + pPage->nOverflow = 0;
1.1048 + assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1.1049 + pPage->maskPage = pBt->pageSize - 1;
1.1050 + pPage->nCell = 0;
1.1051 + pPage->isInit = 1;
1.1052 +}
1.1053 +
1.1054 +
1.1055 +/*
1.1056 +** Convert a DbPage obtained from the pager into a MemPage used by
1.1057 +** the btree layer: fill in aData, pDbPage, pBt, pgno and hdrOffset.
1.1058 +*/
1.1059 +static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1.1060 + MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1.1061 + pPage->aData = sqlite3PagerGetData(pDbPage);
1.1062 + pPage->pDbPage = pDbPage;
1.1063 + pPage->pBt = pBt;
1.1064 + pPage->pgno = pgno;
1.1065 + pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; /* Page header starts at byte 100 on page 1 only */
1.1066 + return pPage;
1.1067 +}
1.1068 +
1.1069 +/*
1.1070 +** Get a page from the pager. Initialize the MemPage.pBt and
1.1071 +** MemPage.aData elements. sqlite3BtreeInitPage() is not called here.
1.1072 +**
1.1073 +** If the noContent flag is set, it means that we do not care about
1.1074 +** the content of the page at this time. So do not go to the disk
1.1075 +** to fetch the content. Just fill in the content with zeros for now.
1.1076 +** If in the future we call sqlite3PagerWrite() on this page, that
1.1077 +** means we have started to be concerned about content and the disk
1.1078 +** read should occur at that point.
1.1079 +*/
1.1080 +int sqlite3BtreeGetPage(
1.1081 + BtShared *pBt, /* The btree */
1.1082 + Pgno pgno, /* Number of the page to fetch */
1.1083 + MemPage **ppPage, /* Return the page in this parameter */
1.1084 + int noContent /* Do not load page content if true */
1.1085 +){
1.1086 + int rc;
1.1087 + DbPage *pDbPage;
1.1088 +
1.1089 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1090 + rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1.1091 + if( rc ) return rc; /* *ppPage is left untouched on failure */
1.1092 + *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1.1093 + return SQLITE_OK;
1.1094 +}
1.1095 +
1.1096 +/*
1.1097 +** Return the size of the database file in pages, as reported by
1.1098 +** sqlite3PagerPagecount(). Return -1 if that call does not succeed.
1.1099 +*/
1.1100 +static int pagerPagecount(Pager *pPager){
1.1101 + int rc;
1.1102 + int nPage;
1.1103 + rc = sqlite3PagerPagecount(pPager, &nPage);
1.1104 + return (rc==SQLITE_OK?nPage:-1);
1.1105 +}
1.1106 +
1.1107 +/*
1.1108 +** Get a page from the pager and initialize it. Page number 0 and page
1.1109 +** numbers beyond the end of the file yield SQLITE_CORRUPT_BKPT. If
1.1110 +** initialization fails the page is released and *ppPage is set to 0.
1.1111 +*/
1.1112 +static int getAndInitPage(
1.1113 + BtShared *pBt, /* The database file */
1.1114 + Pgno pgno, /* Number of the page to get */
1.1115 + MemPage **ppPage /* Write the page pointer here */
1.1116 +){
1.1117 + int rc;
1.1118 + DbPage *pDbPage;
1.1119 + MemPage *pPage;
1.1120 +
1.1121 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1122 + if( pgno==0 ){
1.1123 + return SQLITE_CORRUPT_BKPT;
1.1124 + }
1.1125 +
1.1126 + /* It is often the case that the page we want is already in cache.
1.1127 + ** If so, get it directly. This saves us from having to call
1.1128 + ** pagerPagecount() to make sure pgno is within limits, which results
1.1129 + ** in a measurable performance improvement.
1.1130 + */
1.1131 + pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1.1132 + if( pDbPage ){
1.1133 + /* Page is already in cache */
1.1134 + *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1.1135 + rc = SQLITE_OK;
1.1136 + }else{
1.1137 + /* Page not in cache. Acquire it. */
1.1138 + if( pgno>pagerPagecount(pBt->pPager) ){ /* NOTE(review): a -1 (error) result is not caught here if Pgno is unsigned - confirm */
1.1139 + return SQLITE_CORRUPT_BKPT;
1.1140 + }
1.1141 + rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1.1142 + if( rc ) return rc;
1.1143 + pPage = *ppPage;
1.1144 + }
1.1145 + if( !pPage->isInit ){
1.1146 + rc = sqlite3BtreeInitPage(pPage);
1.1147 + }
1.1148 + if( rc!=SQLITE_OK ){
1.1149 + releasePage(pPage); /* Drop the reference taken above */
1.1150 + *ppPage = 0;
1.1151 + }
1.1152 + return rc;
1.1153 +}
1.1154 +
1.1155 +/*
1.1156 +** Release a MemPage. This should be called once for each prior
1.1157 +** call to sqlite3BtreeGetPage. A NULL pPage is silently ignored.
1.1158 +*/
1.1159 +static void releasePage(MemPage *pPage){
1.1160 + if( pPage ){
1.1161 + assert( pPage->aData );
1.1162 + assert( pPage->pBt );
1.1163 + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1.1164 + assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1.1165 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.1166 + sqlite3PagerUnref(pPage->pDbPage);
1.1167 + }
1.1168 +}
1.1169 +
1.1170 +/*
1.1171 +** During a rollback, when the pager reloads information into the cache
1.1172 +** so that the cache is restored to its original state at the start of
1.1173 +** the transaction, for each page restored this routine is called.
1.1174 +**
1.1175 +** This routine needs to reset the extra data section at the end of the
1.1176 +** page (the MemPage) to agree with the restored data.
1.1177 +*/
1.1178 +static void pageReinit(DbPage *pData){
1.1179 + MemPage *pPage;
1.1180 + pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1.1181 + if( pPage->isInit ){
1.1182 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.1183 + pPage->isInit = 0; /* Force sqlite3BtreeInitPage() to re-parse the header */
1.1184 + if( sqlite3PagerPageRefcount(pData)>0 ){
1.1185 + sqlite3BtreeInitPage(pPage); /* Return code is not checked here */
1.1186 + }
1.1187 + }
1.1188 +}
1.1189 +
1.1190 +/*
1.1191 +** Invoke the busy handler for a btree. pArg is the BtShared; n is unused.
1.1192 +*/
1.1193 +static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1.1194 + BtShared *pBt = (BtShared*)pArg;
1.1195 + assert( pBt->db );
1.1196 + assert( sqlite3_mutex_held(pBt->db->mutex) );
1.1197 + return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1.1198 +}
1.1199 +
1.1200 +/*
1.1201 +** Open a database file. On success *ppBtree receives the new handle
1.1202 +** and SQLITE_OK is returned; otherwise an error code is returned.
1.1203 +** zFilename is the name of the database file. If zFilename is NULL
1.1204 +** a new database with a random name is created. This randomly named
1.1205 +** database file will be deleted when sqlite3BtreeClose() is called.
1.1206 +** If zFilename is ":memory:" then an in-memory database is created
1.1207 +** that is automatically destroyed when it is closed.
1.1208 +*/
1.1209 +int sqlite3BtreeOpen(
1.1210 + const char *zFilename, /* Name of the file containing the BTree database */
1.1211 + sqlite3 *db, /* Associated database handle */
1.1212 + Btree **ppBtree, /* Pointer to new Btree object written here */
1.1213 + int flags, /* Options */
1.1214 + int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1.1215 +){
1.1216 + sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1.1217 + BtShared *pBt = 0; /* Shared part of btree structure */
1.1218 + Btree *p; /* Handle to return */
1.1219 + int rc = SQLITE_OK; /* Return code */
1.1220 + int nReserve; /* Bytes per page excluded from usableSize */
1.1221 + unsigned char zDbHeader[100]; /* Buffer filled by sqlite3PagerReadFileheader() */
1.1222 +
1.1223 + /* Set the variable isMemdb to true for an in-memory database, or
1.1224 + ** false for a file-based database. This symbol is only required if
1.1225 + ** either of the shared-data or autovacuum features are compiled
1.1226 + ** into the library.
1.1227 + */
1.1228 +#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1.1229 + #ifdef SQLITE_OMIT_MEMORYDB
1.1230 + const int isMemdb = 0;
1.1231 + #else
1.1232 + const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1.1233 + #endif
1.1234 +#endif
1.1235 +
1.1236 + assert( db!=0 );
1.1237 + assert( sqlite3_mutex_held(db->mutex) );
1.1238 +
1.1239 + pVfs = db->pVfs;
1.1240 + p = sqlite3MallocZero(sizeof(Btree));
1.1241 + if( !p ){
1.1242 + return SQLITE_NOMEM;
1.1243 + }
1.1244 + p->inTrans = TRANS_NONE;
1.1245 + p->db = db;
1.1246 +
1.1247 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1248 + /*
1.1249 + ** If this Btree is a candidate for shared cache, try to find an
1.1250 + ** existing BtShared object that we can share with.
1.1251 + */
1.1252 + if( isMemdb==0
1.1253 + && (db->flags & SQLITE_Vtab)==0
1.1254 + && zFilename && zFilename[0]
1.1255 + ){
1.1256 + if( sqlite3GlobalConfig.sharedCacheEnabled ){
1.1257 + int nFullPathname = pVfs->mxPathname+1;
1.1258 + char *zFullPathname = sqlite3Malloc(nFullPathname);
1.1259 + sqlite3_mutex *mutexShared;
1.1260 + p->sharable = 1;
1.1261 + db->flags |= SQLITE_SharedCache;
1.1262 + if( !zFullPathname ){
1.1263 + sqlite3_free(p);
1.1264 + return SQLITE_NOMEM;
1.1265 + }
1.1266 + sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1.1267 + mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1.1268 + sqlite3_mutex_enter(mutexShared);
1.1269 + for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1.1270 + assert( pBt->nRef>0 );
1.1271 + if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1.1272 + && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1.1273 + p->pBt = pBt; /* Same full pathname and same VFS: share this BtShared */
1.1274 + pBt->nRef++;
1.1275 + break;
1.1276 + }
1.1277 + }
1.1278 + sqlite3_mutex_leave(mutexShared);
1.1279 + sqlite3_free(zFullPathname);
1.1280 + }
1.1281 +#ifdef SQLITE_DEBUG
1.1282 + else{
1.1283 + /* In debug mode, we mark all persistent databases as sharable
1.1284 + ** even when they are not. This exercises the locking code and
1.1285 + ** gives more opportunity for asserts(sqlite3_mutex_held())
1.1286 + ** statements to find locking problems.
1.1287 + */
1.1288 + p->sharable = 1;
1.1289 + }
1.1290 +#endif
1.1291 + }
1.1292 +#endif
1.1293 + if( pBt==0 ){
1.1294 + /*
1.1295 + ** The following asserts make sure that structures used by the btree are
1.1296 + ** the right size. This is to guard against size changes that result
1.1297 + ** when compiling on a different architecture.
1.1298 + */
1.1299 + assert( sizeof(i64)==8 || sizeof(i64)==4 );
1.1300 + assert( sizeof(u64)==8 || sizeof(u64)==4 );
1.1301 + assert( sizeof(u32)==4 );
1.1302 + assert( sizeof(u16)==2 );
1.1303 + assert( sizeof(Pgno)==4 );
1.1304 +
1.1305 + pBt = sqlite3MallocZero( sizeof(*pBt) );
1.1306 + if( pBt==0 ){
1.1307 + rc = SQLITE_NOMEM;
1.1308 + goto btree_open_out;
1.1309 + }
1.1310 + pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1.1311 + pBt->busyHdr.pArg = pBt;
1.1312 + rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1.1313 + EXTRA_SIZE, flags, vfsFlags);
1.1314 + if( rc==SQLITE_OK ){
1.1315 + rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1.1316 + }
1.1317 + if( rc!=SQLITE_OK ){
1.1318 + goto btree_open_out;
1.1319 + }
1.1320 + sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1.1321 + p->pBt = pBt;
1.1322 +
1.1323 + sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1.1324 + pBt->pCursor = 0;
1.1325 + pBt->pPage1 = 0;
1.1326 + pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1.1327 + pBt->pageSize = get2byte(&zDbHeader[16]); /* Page size is read from header bytes 16..17 */
1.1328 + if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1.1329 + || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1.1330 + pBt->pageSize = 0;
1.1331 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1332 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1333 + /* If the magic name ":memory:" will create an in-memory database, then
1.1334 + ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1.1335 + ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1.1336 + ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1.1337 + ** regular file-name. In this case the auto-vacuum applies as per normal.
1.1338 + */
1.1339 + if( zFilename && !isMemdb ){
1.1340 + pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1.1341 + pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1.1342 + }
1.1343 +#endif
1.1344 + nReserve = 0;
1.1345 + }else{
1.1346 + nReserve = zDbHeader[20]; /* Reserved-byte count is read from header byte 20 */
1.1347 + pBt->pageSizeFixed = 1;
1.1348 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1349 + pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1.1350 + pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1.1351 +#endif
1.1352 + }
1.1353 + pBt->usableSize = pBt->pageSize - nReserve;
1.1354 + assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1.1355 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1356 +
1.1357 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1358 + /* Add the new BtShared object to the linked list of sharable BtShareds.
1.1359 + */
1.1360 + if( p->sharable ){
1.1361 + sqlite3_mutex *mutexShared;
1.1362 + pBt->nRef = 1;
1.1363 + mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1.1364 + if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1.1365 + pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1.1366 + if( pBt->mutex==0 ){
1.1367 + rc = SQLITE_NOMEM;
1.1368 + db->mallocFailed = 0; /* NOTE(review): clears the flag on an out-of-memory path - confirm intent */
1.1369 + goto btree_open_out;
1.1370 + }
1.1371 + }
1.1372 + sqlite3_mutex_enter(mutexShared);
1.1373 + pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1.1374 + GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1.1375 + sqlite3_mutex_leave(mutexShared);
1.1376 + }
1.1377 +#endif
1.1378 + }
1.1379 +
1.1380 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1381 + /* If the new Btree uses a sharable pBtShared, then link the new
1.1382 + ** Btree into the list of all sharable Btrees for the same connection.
1.1383 + ** The list is kept in ascending order by pBt address.
1.1384 + */
1.1385 + if( p->sharable ){
1.1386 + int i;
1.1387 + Btree *pSib;
1.1388 + for(i=0; i<db->nDb; i++){
1.1389 + if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1.1390 + while( pSib->pPrev ){ pSib = pSib->pPrev; } /* Rewind to the head of the list */
1.1391 + if( p->pBt<pSib->pBt ){
1.1392 + p->pNext = pSib;
1.1393 + p->pPrev = 0;
1.1394 + pSib->pPrev = p;
1.1395 + }else{
1.1396 + while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1.1397 + pSib = pSib->pNext;
1.1398 + }
1.1399 + p->pNext = pSib->pNext;
1.1400 + p->pPrev = pSib;
1.1401 + if( p->pNext ){
1.1402 + p->pNext->pPrev = p;
1.1403 + }
1.1404 + pSib->pNext = p;
1.1405 + }
1.1406 + break;
1.1407 + }
1.1408 + }
1.1409 + }
1.1410 +#endif
1.1411 + *ppBtree = p;
1.1412 +
1.1413 +btree_open_out:
1.1414 + if( rc!=SQLITE_OK ){
1.1415 + if( pBt && pBt->pPager ){
1.1416 + sqlite3PagerClose(pBt->pPager);
1.1417 + }
1.1418 + sqlite3_free(pBt);
1.1419 + sqlite3_free(p);
1.1420 + *ppBtree = 0;
1.1421 + }
1.1422 + return rc;
1.1423 +}
1.1424 +
1.1425 +/*
1.1426 +** Decrement the BtShared.nRef counter. When it reaches zero,
1.1427 +** remove the BtShared structure from the sharing list. Return
1.1428 +** true if the BtShared.nRef counter reaches zero and return
1.1429 +** false if it is still positive. Always returns 1 if
1.1430 +** SQLITE_OMIT_SHARED_CACHE is defined.
1.1431 +static int removeFromSharingList(BtShared *pBt){
1.1464 +
1.1465 +/*
1.1466 +** Make sure pBt->pTmpSpace is allocated: if it is NULL, request
1.1467 +** pBt->pageSize bytes (enough for MX_CELL_SIZE(pBt), presumably).
1.1468 +*/
1.1469 +static void allocateTempSpace(BtShared *pBt){
1.1470 + if( !pBt->pTmpSpace ){
1.1471 + pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); /* Result is not checked here */
1.1472 + }
1.1473 +}
1.1474 +
1.1475 +/*
1.1476 +** Free the pBt->pTmpSpace allocation and reset the pointer to 0.
1.1477 +*/
1.1478 +static void freeTempSpace(BtShared *pBt){
1.1479 + sqlite3PageFree( pBt->pTmpSpace);
1.1480 + pBt->pTmpSpace = 0;
1.1481 +}
1.1482 +
1.1483 +/*
1.1484 +** Close an open database and invalidate all cursors. Returns SQLITE_OK.
1.1485 +*/
1.1486 +int sqlite3BtreeClose(Btree *p){
1.1487 + BtShared *pBt = p->pBt;
1.1488 + BtCursor *pCur;
1.1489 +
1.1490 + /* Close all cursors opened via this handle. */
1.1491 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1492 + sqlite3BtreeEnter(p);
1.1493 + pBt->db = p->db;
1.1494 + pCur = pBt->pCursor;
1.1495 + while( pCur ){
1.1496 + BtCursor *pTmp = pCur;
1.1497 + pCur = pCur->pNext; /* Advance first: pTmp may be closed below */
1.1498 + if( pTmp->pBtree==p ){
1.1499 + sqlite3BtreeCloseCursor(pTmp);
1.1500 + }
1.1501 + }
1.1502 +
1.1503 + /* Rollback any active transaction and free the handle structure.
1.1504 + ** The call to sqlite3BtreeRollback() drops any table-locks held by
1.1505 + ** this handle.
1.1506 + */
1.1507 + sqlite3BtreeRollback(p);
1.1508 + sqlite3BtreeLeave(p);
1.1509 +
1.1510 + /* If there are still other outstanding references to the shared-btree
1.1511 + ** structure, return now. The remainder of this procedure cleans
1.1512 + ** up the shared-btree.
1.1513 + */
1.1514 + assert( p->wantToLock==0 && p->locked==0 );
1.1515 + if( !p->sharable || removeFromSharingList(pBt) ){
1.1516 + /* The pBt is no longer on the sharing list, so we can access
1.1517 + ** it without having to hold the mutex.
1.1518 + **
1.1519 + ** Clean out and delete the BtShared object.
1.1520 + */
1.1521 + assert( !pBt->pCursor );
1.1522 + sqlite3PagerClose(pBt->pPager);
1.1523 + if( pBt->xFreeSchema && pBt->pSchema ){
1.1524 + pBt->xFreeSchema(pBt->pSchema);
1.1525 + }
1.1526 + sqlite3_free(pBt->pSchema);
1.1527 + freeTempSpace(pBt);
1.1528 + sqlite3_free(pBt);
1.1529 + }
1.1530 +
1.1531 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.1532 + assert( p->wantToLock==0 );
1.1533 + assert( p->locked==0 );
1.1534 + if( p->pPrev ) p->pPrev->pNext = p->pNext; /* Unlink p from its pPrev/pNext sibling list */
1.1535 + if( p->pNext ) p->pNext->pPrev = p->pPrev;
1.1536 +#endif
1.1537 +
1.1538 + sqlite3_free(p);
1.1539 + return SQLITE_OK;
1.1540 +}
1.1541 +
1.1542 +/*
1.1543 +** Change the limit on the number of pages allowed in the cache by
1.1544 +** passing mxPage to sqlite3PagerSetCachesize(). Returns SQLITE_OK.
1.1545 +** The maximum number of cache pages is set to the absolute
1.1546 +** value of mxPage. If mxPage is negative, the pager will
1.1547 +** operate asynchronously - it will not stop to do fsync()s
1.1548 +** to ensure data is written to the disk surface before
1.1549 +** continuing. Transactions still work if synchronous is off,
1.1550 +** and the database cannot be corrupted if this program
1.1551 +** crashes. But if the operating system crashes or there is
1.1552 +** an abrupt power failure when synchronous is off, the database
1.1553 +** could be left in an inconsistent and unrecoverable state.
1.1554 +** Synchronous is on by default so database corruption is not
1.1555 +** normally a worry. (Pager behavior; not visible here - TODO confirm.)
1.1556 +*/
1.1557 +int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1.1558 + BtShared *pBt = p->pBt;
1.1559 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1560 + sqlite3BtreeEnter(p);
1.1561 + sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1.1562 + sqlite3BtreeLeave(p);
1.1563 + return SQLITE_OK;
1.1564 +}
1.1565 +
1.1566 +/*
1.1567 +** Change the way data is synced to disk in order to increase or decrease
1.1568 +** how well the database resists damage due to OS crashes and power
1.1569 +** failures. Level 1 is the same as asynchronous (no syncs() occur and
1.1570 +** there is a high probability of damage). Level 2 is the default; there
1.1571 +** is a very low but non-zero probability of damage. Level 3 reduces the
1.1572 +** probability of damage to near zero but reduces write performance.
1.1573 +*/
1.1574 +#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1.1575 +int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1.1576 + BtShared *pBt = p->pBt;
1.1577 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1578 + sqlite3BtreeEnter(p);
1.1579 + sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); /* level and fullSync are passed through unchanged */
1.1580 + sqlite3BtreeLeave(p);
1.1581 + return SQLITE_OK;
1.1582 +}
1.1583 +#endif
1.1584 +
1.1585 +/*
1.1586 +** Return TRUE if the given btree is set to safety level 1. In other
1.1587 +** words, return TRUE if no sync() occurs on the disk files.
1.1588 +*/
1.1589 +int sqlite3BtreeSyncDisabled(Btree *p){
1.1590 + BtShared *pBt = p->pBt;
1.1591 + int rc;
1.1592 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1593 + sqlite3BtreeEnter(p);
1.1594 + assert( pBt && pBt->pPager );
1.1595 + rc = sqlite3PagerNosync(pBt->pPager); /* The result is whatever the pager reports */
1.1596 + sqlite3BtreeLeave(p);
1.1597 + return rc;
1.1598 +}
1.1599 +
1.1600 +#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1.1601 +/*
1.1602 +** Change the default page size and the number of reserved bytes per page.
1.1603 +** Returns SQLITE_READONLY, changing nothing, if the page size is fixed.
1.1604 +** The page size must be a power of 2 between 512 and SQLITE_MAX_PAGE_SIZE.
1.1605 +** If the size supplied does not meet this constraint then the page size is
1.1606 +** not changed (usableSize is still recomputed from nReserve).
1.1607 +**
1.1608 +** Page sizes are constrained to be a power of two so that the region
1.1609 +** of the database file used for locking (beginning at PENDING_BYTE,
1.1610 +** the first byte past the 1GB boundary, 0x40000000) occurs
1.1611 +** at the beginning of a page.
1.1612 +**
1.1613 +** If parameter nReserve is less than zero, then the number of reserved
1.1614 +** bytes per page is left unchanged.
1.1615 +*/
1.1616 +int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1.1617 + int rc = SQLITE_OK;
1.1618 + BtShared *pBt = p->pBt;
1.1619 + sqlite3BtreeEnter(p);
1.1620 + if( pBt->pageSizeFixed ){
1.1621 + sqlite3BtreeLeave(p);
1.1622 + return SQLITE_READONLY;
1.1623 + }
1.1624 + if( nReserve<0 ){
1.1625 + nReserve = pBt->pageSize - pBt->usableSize; /* Keep the current reserve */
1.1626 + }
1.1627 + if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1.1628 + ((pageSize-1)&pageSize)==0 ){
1.1629 + assert( (pageSize & 7)==0 );
1.1630 + assert( !pBt->pPage1 && !pBt->pCursor );
1.1631 + pBt->pageSize = pageSize;
1.1632 + freeTempSpace(pBt); /* pTmpSpace was sized for the old page size */
1.1633 + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1634 + }
1.1635 + pBt->usableSize = pBt->pageSize - nReserve;
1.1636 + sqlite3BtreeLeave(p);
1.1637 + return rc;
1.1638 +}
1.1639 +
1.1640 +/*
1.1641 +** Return the currently defined page size (p->pBt->pageSize).
1.1642 +*/
1.1643 +int sqlite3BtreeGetPageSize(Btree *p){
1.1644 + return p->pBt->pageSize; /* Read without sqlite3BtreeEnter(), unlike sqlite3BtreeGetReserve() */
1.1645 +}
1.1646 +int sqlite3BtreeGetReserve(Btree *p){ /* Return pageSize - usableSize */
1.1647 + int n;
1.1648 + sqlite3BtreeEnter(p);
1.1649 + n = p->pBt->pageSize - p->pBt->usableSize;
1.1650 + sqlite3BtreeLeave(p);
1.1651 + return n;
1.1652 +}
1.1653 +
1.1654 +/*
1.1655 +** Set the maximum page count for a database if mxPage is positive.
1.1656 +** No changes are made if mxPage is 0 or negative.
1.1657 +** Regardless of the value of mxPage, return the maximum page count.
1.1658 +*/
1.1659 +int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1.1660 + int n;
1.1661 + sqlite3BtreeEnter(p);
1.1662 + n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); /* The pager applies mxPage; this routine only forwards it */
1.1663 + sqlite3BtreeLeave(p);
1.1664 + return n;
1.1665 +}
1.1666 +#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1.1667 +
1.1668 +/*
1.1669 +** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1.1670 +** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1.1671 +** is disabled. Returns SQLITE_READONLY if the page size is already fixed
1.1672 +** and the setting would change, or if SQLITE_OMIT_AUTOVACUUM is defined.
1.1673 +*/
1.1674 +int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1.1675 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.1676 + return SQLITE_READONLY;
1.1677 +#else
1.1678 + BtShared *pBt = p->pBt;
1.1679 + int rc = SQLITE_OK;
1.1680 + int av = (autoVacuum?1:0); /* Normalize to 0 or 1 */
1.1681 +
1.1682 + sqlite3BtreeEnter(p);
1.1683 + if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1.1684 + rc = SQLITE_READONLY;
1.1685 + }else{
1.1686 + pBt->autoVacuum = av;
1.1687 + }
1.1688 + sqlite3BtreeLeave(p);
1.1689 + return rc;
1.1690 +#endif
1.1691 +}
1.1692 +
1.1693 +/*
1.1694 +** Return the value of the 'auto-vacuum' property: one of
1.1695 +** BTREE_AUTOVACUUM_NONE, BTREE_AUTOVACUUM_FULL or BTREE_AUTOVACUUM_INCR.
1.1696 +*/
1.1697 +int sqlite3BtreeGetAutoVacuum(Btree *p){
1.1698 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.1699 + return BTREE_AUTOVACUUM_NONE;
1.1700 +#else
1.1701 + int rc;
1.1702 + sqlite3BtreeEnter(p);
1.1703 + rc = (
1.1704 + (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1.1705 + (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1.1706 + BTREE_AUTOVACUUM_INCR
1.1707 + );
1.1708 + sqlite3BtreeLeave(p);
1.1709 + return rc;
1.1710 +#endif
1.1711 +}
1.1712 +
1.1713 +
1.1714 +/*
1.1715 +** Get a reference to pPage1 of the database file. This will
1.1716 +** also acquire a readlock on that file.
1.1717 +**
1.1718 +** SQLITE_OK is returned on success. If the file is not a
1.1719 +** well-formed database file, then SQLITE_CORRUPT is returned.
1.1720 +** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1.1721 +** is returned if we run out of memory.
1.1722 +*/
1.1723 +static int lockBtree(BtShared *pBt){
1.1724 + int rc;
1.1725 + MemPage *pPage1;
1.1726 + int nPage;
1.1727 +
1.1728 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1729 + if( pBt->pPage1 ) return SQLITE_OK;
1.1730 + rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1.1731 + if( rc!=SQLITE_OK ) return rc;
1.1732 +
1.1733 + /* Do some checking to help ensure the file we opened really is
1.1734 + ** a valid database file.
1.1735 + */
1.1736 + rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1.1737 + if( rc!=SQLITE_OK ){
1.1738 + goto page1_init_failed;
1.1739 + }else if( nPage>0 ){
1.1740 + int pageSize;
1.1741 + int usableSize;
1.1742 + u8 *page1 = pPage1->aData;
1.1743 + rc = SQLITE_NOTADB;
1.1744 + if( memcmp(page1, zMagicHeader, 16)!=0 ){
1.1745 + goto page1_init_failed;
1.1746 + }
1.1747 + if( page1[18]>1 ){
1.1748 + pBt->readOnly = 1;
1.1749 + }
1.1750 + if( page1[19]>1 ){
1.1751 + goto page1_init_failed;
1.1752 + }
1.1753 +
1.1754 + /* The maximum embedded fraction must be exactly 25%. And the minimum
1.1755 + ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1.1756 + ** The original design allowed these amounts to vary, but as of
1.1757 + ** version 3.6.0, we require them to be fixed.
1.1758 + */
1.1759 + if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1.1760 + goto page1_init_failed;
1.1761 + }
1.1762 + pageSize = get2byte(&page1[16]);
1.1763 + if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1.1764 + (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1.1765 + ){
1.1766 + goto page1_init_failed;
1.1767 + }
1.1768 + assert( (pageSize & 7)==0 );
1.1769 + usableSize = pageSize - page1[20];
1.1770 + if( pageSize!=pBt->pageSize ){
1.1771 + /* After reading the first page of the database assuming a page size
1.1772 + ** of BtShared.pageSize, we have discovered that the page-size is
1.1773 + ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1.1774 + ** zero and return SQLITE_OK. The caller will call this function
1.1775 + ** again with the correct page-size.
1.1776 + */
1.1777 + releasePage(pPage1);
1.1778 + pBt->usableSize = usableSize;
1.1779 + pBt->pageSize = pageSize;
1.1780 + freeTempSpace(pBt);
1.1781 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1782 + return SQLITE_OK;
1.1783 + }
1.1784 + if( usableSize<500 ){
1.1785 + goto page1_init_failed;
1.1786 + }
1.1787 + pBt->pageSize = pageSize;
1.1788 + pBt->usableSize = usableSize;
1.1789 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1790 + pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1.1791 + pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1.1792 +#endif
1.1793 + }
1.1794 +
1.1795 + /* maxLocal is the maximum amount of payload to store locally for
1.1796 + ** a cell. Make sure it is small enough so that at least minFanout
1.1797 + ** cells can will fit on one page. We assume a 10-byte page header.
1.1798 + ** Besides the payload, the cell must store:
1.1799 + ** 2-byte pointer to the cell
1.1800 + ** 4-byte child pointer
1.1801 + ** 9-byte nKey value
1.1802 + ** 4-byte nData value
1.1803 + ** 4-byte overflow page pointer
1.1804 + ** So a cell consists of a 2-byte poiner, a header which is as much as
1.1805 + ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1.1806 + ** page pointer.
1.1807 + */
1.1808 + pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1.1809 + pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
1.1810 + pBt->maxLeaf = pBt->usableSize - 35;
1.1811 + pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
1.1812 + assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1.1813 + pBt->pPage1 = pPage1;
1.1814 + return SQLITE_OK;
1.1815 +
1.1816 +page1_init_failed:
1.1817 + releasePage(pPage1);
1.1818 + pBt->pPage1 = 0;
1.1819 + return rc;
1.1820 +}
1.1821 +
1.1822 +/*
1.1823 +** This routine works like lockBtree() except that it also invokes the
1.1824 +** busy callback if there is lock contention.
1.1825 +*/
1.1826 +static int lockBtreeWithRetry(Btree *pRef){
1.1827 + BtShared *pShared = pRef->pBt; /* Shared btree behind this handle */
1.1828 + u8 savedState; /* pShared->inTransaction as found on entry */
1.1829 + int rc; /* Result of the temporary read transaction */
1.1830 +
1.1831 + assert( sqlite3BtreeHoldsMutex(pRef) );
1.1832 + if( pRef->inTrans!=TRANS_NONE ) return SQLITE_OK;
1.1833 + savedState = pShared->inTransaction;
1.1834 + btreeIntegrity(pRef);
1.1835 + rc = sqlite3BtreeBeginTrans(pRef, 0);
1.1836 + /* Put back the transaction bookkeeping that BeginTrans changed. */
1.1837 + pShared->inTransaction = savedState;
1.1838 + pRef->inTrans = TRANS_NONE;
1.1839 + if( rc==SQLITE_OK ) pShared->nTransaction--;
1.1840 + btreeIntegrity(pRef);
1.1841 + return rc;
1.1842 +}
1.1843 +
1.1844 +
1.1845 +/*
1.1846 +** If there are no outstanding cursors and we are not in the middle
1.1847 +** of a transaction but there is a read lock on the database, then
1.1848 +** this routine unrefs the first page of the database file which
1.1849 +** has the effect of releasing the read lock.
1.1850 +**
1.1851 +** If there are any outstanding cursors, this routine is a no-op.
1.1852 +**
1.1853 +** If there is a transaction in progress, this routine is a no-op.
1.1854 +*/
1.1855 +static void unlockBtreeIfUnused(BtShared *pBt){
1.1856 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1857 + if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1.1858 + if( sqlite3PagerRefcount(pBt->pPager)>=1 ){ /* Release only while the pager still reports a reference */
1.1859 + assert( pBt->pPage1->aData ); /* The disabled code below used to rebuild aData when it was zero */
1.1860 +#if 0
1.1861 + if( pBt->pPage1->aData==0 ){
1.1862 + MemPage *pPage = pBt->pPage1;
1.1863 + pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1.1864 + pPage->pBt = pBt;
1.1865 + pPage->pgno = 1;
1.1866 + }
1.1867 +#endif
1.1868 + releasePage(pBt->pPage1);
1.1869 + }
1.1870 + pBt->pPage1 = 0; /* Cleared even when no release was performed above */
1.1871 + pBt->inStmt = 0;
1.1872 + }
1.1873 +}
1.1874 +
1.1875 +/*
1.1876 +** Create a new database by initializing the first page of the
1.1877 +** file.
1.1878 +*/
1.1879 +static int newDatabase(BtShared *pBt){
1.1880 + MemPage *pRoot; /* Page 1 of the database file */
1.1881 + unsigned char *aHdr; /* Raw bytes of page 1 */
1.1882 + int nExisting; /* Page count reported by the pager */
1.1883 + int rc; /* Result code of the pager calls */
1.1884 +
1.1885 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1886 + rc = sqlite3PagerPagecount(pBt->pPager, &nExisting);
1.1887 + if( rc!=SQLITE_OK ) return rc;
1.1888 + if( nExisting>0 ) return SQLITE_OK; /* File already has pages: leave it alone */
1.1889 +
1.1890 + pRoot = pBt->pPage1;
1.1891 + assert( pRoot!=0 );
1.1892 + aHdr = pRoot->aData;
1.1893 + rc = sqlite3PagerWrite(pRoot->pDbPage);
1.1894 + if( rc!=SQLITE_OK ) return rc;
1.1895 +
1.1896 + /* Bytes 0..23: magic string, page size, two format bytes, the
1.1897 + ** pageSize-usableSize difference and the three payload fractions. */
1.1898 + assert( sizeof(zMagicHeader)==16 );
1.1899 + memcpy(aHdr, zMagicHeader, sizeof(zMagicHeader));
1.1900 + put2byte(&aHdr[16], pBt->pageSize);
1.1901 + aHdr[18] = aHdr[19] = 1;
1.1902 + aHdr[20] = pBt->pageSize - pBt->usableSize;
1.1903 + aHdr[21] = 64;
1.1904 + aHdr[22] = aHdr[23] = 32;
1.1905 + memset(&aHdr[24], 0, 100-24); /* Bytes 24..99 start out as zero */
1.1906 + zeroPage(pRoot, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1.1907 + pBt->pageSizeFixed = 1;
1.1908 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1909 + assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1.1910 + assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1.1911 + put4byte(&aHdr[36 + 4*4], pBt->autoVacuum);
1.1912 + put4byte(&aHdr[36 + 7*4], pBt->incrVacuum);
1.1913 +#endif
1.1914 + return SQLITE_OK; }
1.1915 +
1.1916 +/*
1.1917 +** Attempt to start a new transaction. A write-transaction
1.1918 +** is started if the second argument is nonzero, otherwise a read-
1.1919 +** transaction. If the second argument is 2 or more an exclusive
1.1920 +** transaction is started, meaning that no other process is allowed
1.1921 +** to access the database. A preexisting transaction may not be
1.1922 +** upgraded to exclusive by calling this routine a second time - the
1.1923 +** exclusivity flag only works for a new transaction.
1.1924 +**
1.1925 +** A write-transaction must be started before attempting any
1.1926 +** changes to the database. None of the following routines
1.1927 +** will work unless a transaction is started first:
1.1928 +**
1.1929 +** sqlite3BtreeCreateTable()
1.1930 +** sqlite3BtreeCreateIndex()
1.1931 +** sqlite3BtreeClearTable()
1.1932 +** sqlite3BtreeDropTable()
1.1933 +** sqlite3BtreeInsert()
1.1934 +** sqlite3BtreeDelete()
1.1935 +** sqlite3BtreeUpdateMeta()
1.1936 +**
1.1937 +** If an initial attempt to acquire the lock fails because of lock contention
1.1938 +** and the database was previously unlocked, then invoke the busy handler
1.1939 +** if there is one. But if there was previously a read-lock, do not
1.1940 +** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1.1941 +** returned when there is already a read-lock in order to avoid a deadlock.
1.1942 +**
1.1943 +** Suppose there are two processes A and B. A has a read lock and B has
1.1944 +** a reserved lock. B tries to promote to exclusive but is blocked because
1.1945 +** of A's read lock. A tries to promote to reserved but is blocked by B.
1.1946 +** One or the other of the two processes must give way or there can be
1.1947 +** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1.1948 +** when A already has a read lock, we encourage A to give up and let B
1.1949 +** proceed.
1.1950 +*/
1.1951 +int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1.1952 + BtShared *pBt = p->pBt;
1.1953 + int rc = SQLITE_OK;
1.1954 +
1.1955 + sqlite3BtreeEnter(p);
1.1956 + pBt->db = p->db;
1.1957 + btreeIntegrity(p);
1.1958 +
1.1959 + /* If the btree is already in a write-transaction, or it
1.1960 + ** is already in a read-transaction and a read-transaction
1.1961 + ** is requested, this is a no-op.
1.1962 + */
1.1963 + if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1.1964 + goto trans_begun;
1.1965 + }
1.1966 +
1.1967 + /* Write transactions are not possible on a read-only database */
1.1968 + if( pBt->readOnly && wrflag ){
1.1969 + rc = SQLITE_READONLY;
1.1970 + goto trans_begun;
1.1971 + }
1.1972 +
1.1973 + /* If another database handle has already opened a write transaction
1.1974 + ** on this shared-btree structure and a second write transaction is
1.1975 + ** requested, return SQLITE_BUSY.
1.1976 + */
1.1977 + if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1.1978 + rc = SQLITE_BUSY;
1.1979 + goto trans_begun;
1.1980 + }
1.1981 +
1.1982 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.1983 + if( wrflag>1 ){ /* Exclusive request: any pBt->pLock entry owned by another Btree means SQLITE_BUSY */
1.1984 + BtLock *pIter;
1.1985 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.1986 + if( pIter->pBtree!=p ){
1.1987 + rc = SQLITE_BUSY;
1.1988 + goto trans_begun;
1.1989 + }
1.1990 + }
1.1991 + }
1.1992 +#endif
1.1993 +
1.1994 + do {
1.1995 + if( pBt->pPage1==0 ){
1.1996 + do{ /* lockBtree() can return SQLITE_OK with pPage1 still zero after a page-size change */
1.1997 + rc = lockBtree(pBt);
1.1998 + }while( pBt->pPage1==0 && rc==SQLITE_OK );
1.1999 + }
1.2000 +
1.2001 + if( rc==SQLITE_OK && wrflag ){
1.2002 + if( pBt->readOnly ){ /* Checked again: lockBtree() may have just set readOnly */
1.2003 + rc = SQLITE_READONLY;
1.2004 + }else{
1.2005 + rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1); /* Second argument is 1 only for an exclusive request */
1.2006 + if( rc==SQLITE_OK ){
1.2007 + rc = newDatabase(pBt); /* Writes the page-1 header when the file has no pages yet */
1.2008 + }
1.2009 + }
1.2010 + }
1.2011 +
1.2012 + if( rc==SQLITE_OK ){
1.2013 + if( wrflag ) pBt->inStmt = 0;
1.2014 + }else{
1.2015 + unlockBtreeIfUnused(pBt); /* On failure drop page 1 again if nothing else is using it */
1.2016 + }
1.2017 + }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1.2018 + sqlite3BtreeInvokeBusyHandler(pBt, 0) ); /* Retry only when no transaction is open on the shared btree */
1.2019 +
1.2020 + if( rc==SQLITE_OK ){
1.2021 + if( p->inTrans==TRANS_NONE ){ /* Count this handle once, not again on a read-to-write upgrade */
1.2022 + pBt->nTransaction++;
1.2023 + }
1.2024 + p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1.2025 + if( p->inTrans>pBt->inTransaction ){ /* The shared state only ever moves upward here */
1.2026 + pBt->inTransaction = p->inTrans;
1.2027 + }
1.2028 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.2029 + if( wrflag>1 ){
1.2030 + assert( !pBt->pExclusive );
1.2031 + pBt->pExclusive = p;
1.2032 + }
1.2033 +#endif
1.2034 + }
1.2035 +
1.2036 +
1.2037 +trans_begun:
1.2038 + btreeIntegrity(p);
1.2039 + sqlite3BtreeLeave(p);
1.2040 + return rc;
1.2041 +}
1.2042 +
1.2043 +
1.2044 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2045 +
1.2046 +/*
1.2047 +** Set the pointer-map entries for all children of page pPage. Also, if
1.2048 +** pPage contains cells that point to overflow pages, set the pointer
1.2049 +** map entries for the overflow pages as well.
1.2050 +*/
1.2051 +static int setChildPtrmaps(MemPage *pPage){
1.2052 + BtShared *pShared = pPage->pBt; /* Btree that owns the pointer map */
1.2053 + Pgno iParent = pPage->pgno; /* Page number recorded as the parent */
1.2054 + int savedInit = pPage->isInit; /* isInit flag to put back on exit */
1.2055 + int rc; /* Return code */
1.2056 +
1.2057 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.2058 + rc = sqlite3BtreeInitPage(pPage);
1.2059 + if( rc==SQLITE_OK ){
1.2060 + int nCell = pPage->nCell; /* Number of cells in page pPage */
1.2061 + int i; /* Cell index */
1.2062 +
1.2063 + /* Record the overflow pointer of every cell and, when pPage is not
1.2064 + ** a leaf, the child page number held in the first 4 bytes of the
1.2065 + ** cell. Stop at the first pointer-map update that fails.
1.2066 + */
1.2067 + for(i=0; rc==SQLITE_OK && i<nCell; i++){
1.2068 + u8 *pCell = findCell(pPage, i);
1.2069 + rc = ptrmapPutOvflPtr(pPage, pCell);
1.2070 + if( rc==SQLITE_OK && !pPage->leaf ){
1.2071 + rc = ptrmapPut(pShared, get4byte(pCell), PTRMAP_BTREE, iParent);
1.2072 + }
1.2073 + }
1.2074 +
1.2075 + /* A non-leaf page has one more child pointer, stored 8 bytes past
1.2076 + ** hdrOffset in the page data.
1.2077 + */
1.2078 + if( rc==SQLITE_OK && !pPage->leaf ){
1.2079 + Pgno iRight = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.2080 + rc = ptrmapPut(pShared, iRight, PTRMAP_BTREE, iParent);
1.2081 + }
1.2082 + }
1.2083 +
1.2084 + /* Success or failure, leave the isInit flag exactly as it was found
1.2085 + ** when this routine was entered.
1.2086 + */
1.2087 + pPage->isInit = savedInit;
1.2088 + return rc;
1.2089 +}
1.2090 +
1.2091 +/*
1.2092 +** Somewhere on pPage is a pointer to page iFrom. Modify this pointer so
1.2093 +** that it points to iTo. Returns SQLITE_OK on success, SQLITE_CORRUPT_BKPT
1.2094 +** if no such pointer is found, or the error from sqlite3BtreeInitPage().
1.2095 +** Parameter eType describes the type of pointer to be modified, as follows:
1.2096 +**
1.2097 +** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1.2098 +** page of pPage.
1.2099 +**
1.2100 +** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1.2101 +** page pointed to by one of the cells on pPage.
1.2102 +**
1.2103 +** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1.2104 +** overflow page in the list.
1.2105 +*/
1.2106 +static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1.2107 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.2108 + if( eType==PTRMAP_OVERFLOW2 ){
1.2109 + /* The pointer is always the first 4 bytes of the page in this case. */
1.2110 + if( get4byte(pPage->aData)!=iFrom ){
1.2111 + return SQLITE_CORRUPT_BKPT;
1.2112 + }
1.2113 + put4byte(pPage->aData, iTo);
1.2114 + }else{
1.2115 + int isInitOrig = pPage->isInit; /* Restored before returning, as setChildPtrmaps() does */
1.2116 + int i;
1.2117 + int nCell;
1.2118 + int rc; /* Result of initializing pPage */
1.2119 + rc = sqlite3BtreeInitPage(pPage); /* nCell and the cell pointers are only usable if this succeeds */
1.2120 + if( rc!=SQLITE_OK ){ pPage->isInit = isInitOrig; return rc; }
1.2121 + nCell = pPage->nCell;
1.2122 + for(i=0; i<nCell; i++){
1.2123 + u8 *pCell = findCell(pPage, i);
1.2124 + if( eType==PTRMAP_OVERFLOW1 ){
1.2125 + CellInfo info;
1.2126 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.2127 + if( info.iOverflow ){ /* Nonzero only for a cell that has an overflow pointer */
1.2128 + if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1.2129 + put4byte(&pCell[info.iOverflow], iTo);
1.2130 + break;
1.2131 + }
1.2132 + }
1.2133 + }else{
1.2134 + if( get4byte(pCell)==iFrom ){ /* Child pointer in the first 4 bytes of the cell */
1.2135 + put4byte(pCell, iTo);
1.2136 + break;
1.2137 + }
1.2138 + }
1.2139 + }
1.2140 +
1.2141 + if( i==nCell ){ /* No cell matched: only the pointer at hdrOffset+8 is left to try */
1.2142 + if( eType!=PTRMAP_BTREE ||
1.2143 + get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1.2144 + return SQLITE_CORRUPT_BKPT;
1.2145 + }
1.2146 + put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1.2147 + }
1.2148 +
1.2149 + pPage->isInit = isInitOrig;
1.2150 + }
1.2151 + return SQLITE_OK;
1.2152 +}
1.2153 +
1.2154 +
1.2155 +/*
1.2156 +** Move the open database page pDbPage to location iFreePage in the
1.2157 +** database. The pDbPage reference remains valid.
1.2158 +*/
1.2159 +static int relocatePage(
1.2160 + BtShared *pBt, /* Btree */
1.2161 + MemPage *pDbPage, /* Open page to move */
1.2162 + u8 eType, /* Pointer map 'type' entry for pDbPage */
1.2163 + Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
1.2164 + Pgno iFreePage, /* The location to move pDbPage to */
1.2165 + int isCommit /* Passed through unchanged to sqlite3PagerMovepage() */
1.2166 +){
1.2167 + MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
1.2168 + Pgno iDbPage = pDbPage->pgno; /* Page number of pDbPage before the move */
1.2169 + Pager *pPager = pBt->pPager;
1.2170 + int rc;
1.2171 +
1.2172 + assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
1.2173 + eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
1.2174 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2175 + assert( pDbPage->pBt==pBt );
1.2176 +
1.2177 + /* Move page iDbPage from its current location to page number iFreePage */
1.2178 + TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
1.2179 + iDbPage, iFreePage, iPtrPage, eType));
1.2180 + rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
1.2181 + if( rc!=SQLITE_OK ){
1.2182 + return rc;
1.2183 + }
1.2184 + pDbPage->pgno = iFreePage; /* From here on the MemPage carries its new page number */
1.2185 +
1.2186 + /* If pDbPage was a btree-page, then it may have child pages and/or cells
1.2187 + ** that point to overflow pages. The pointer map entries for all these
1.2188 + ** pages need to be changed.
1.2189 + **
1.2190 + ** If pDbPage is an overflow page, then the first 4 bytes may store a
1.2191 + ** pointer to a subsequent overflow page. If this is the case, then
1.2192 + ** the pointer map needs to be updated for the subsequent overflow page.
1.2193 + */
1.2194 + if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
1.2195 + rc = setChildPtrmaps(pDbPage);
1.2196 + if( rc!=SQLITE_OK ){
1.2197 + return rc;
1.2198 + }
1.2199 + }else{
1.2200 + Pgno nextOvfl = get4byte(pDbPage->aData);
1.2201 + if( nextOvfl!=0 ){ /* Zero means there is no following overflow page to update */
1.2202 + rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
1.2203 + if( rc!=SQLITE_OK ){
1.2204 + return rc;
1.2205 + }
1.2206 + }
1.2207 + }
1.2208 +
1.2209 + /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
1.2210 + ** that it points at iFreePage. Also fix the pointer map entry for
1.2211 + ** iPtrPage.
1.2212 + */
1.2213 + if( eType!=PTRMAP_ROOTPAGE ){ /* For PTRMAP_ROOTPAGE no pointer page is rewritten */
1.2214 + rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
1.2215 + if( rc!=SQLITE_OK ){
1.2216 + return rc;
1.2217 + }
1.2218 + rc = sqlite3PagerWrite(pPtrPage->pDbPage); /* Make the page writable before modifyPagePointer() edits it */
1.2219 + if( rc!=SQLITE_OK ){
1.2220 + releasePage(pPtrPage);
1.2221 + return rc;
1.2222 + }
1.2223 + rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
1.2224 + releasePage(pPtrPage); /* Released on both the success and the failure path */
1.2225 + if( rc==SQLITE_OK ){
1.2226 + rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
1.2227 + }
1.2228 + }
1.2229 + return rc;
1.2230 +}
1.2231 +
1.2232 +/* Forward declaration: incrVacuumStep() below calls allocateBtreePage(). */
1.2233 +static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
1.2234 +
1.2235 +/*
1.2236 +** Perform a single step of an incremental-vacuum. If successful,
1.2237 +** return SQLITE_OK. If there is no work to do (and therefore no
1.2238 +** point in calling this function again), return SQLITE_DONE.
1.2239 +**
1.2240 +** More specifically, this function attempts to re-organize the
1.2241 +** database so that the last page of the file currently in use
1.2242 +** is no longer in use.
1.2243 +**
1.2244 +** If the nFin parameter is non-zero, the implementation assumes
1.2245 +** that the caller will keep calling incrVacuumStep() until
1.2246 +** it returns SQLITE_DONE or an error, and that nFin is the
1.2247 +** number of pages the database file will contain after this
1.2248 +** process is complete.
1.2249 +*/
1.2250 +static int incrVacuumStep(BtShared *pBt, Pgno nFin){
1.2251 + Pgno iLastPg; /* Last page in the database */
1.2252 + Pgno nFreeList; /* Number of pages still on the free-list */
1.2253 +
1.2254 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2255 + iLastPg = pBt->nTrunc; /* Nonzero once an earlier step has set it at the end of this function */
1.2256 + if( iLastPg==0 ){
1.2257 + iLastPg = pagerPagecount(pBt->pPager);
1.2258 + }
1.2259 +
1.2260 + if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
1.2261 + int rc;
1.2262 + u8 eType; /* Pointer-map type of page iLastPg */
1.2263 + Pgno iPtrPage; /* Pointer-map page number entry of page iLastPg */
1.2264 +
1.2265 + nFreeList = get4byte(&pBt->pPage1->aData[36]); /* Free-page count kept at offset 36 of page 1 */
1.2266 + if( nFreeList==0 || nFin==iLastPg ){
1.2267 + return SQLITE_DONE;
1.2268 + }
1.2269 +
1.2270 + rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
1.2271 + if( rc!=SQLITE_OK ){
1.2272 + return rc;
1.2273 + }
1.2274 + if( eType==PTRMAP_ROOTPAGE ){
1.2275 + return SQLITE_CORRUPT_BKPT;
1.2276 + }
1.2277 +
1.2278 + if( eType==PTRMAP_FREEPAGE ){
1.2279 + if( nFin==0 ){
1.2280 + /* Remove the page from the files free-list. This is not required
1.2281 + ** if nFin is non-zero. In that case, the free-list will be
1.2282 + ** truncated to zero after this function returns, so it doesn't
1.2283 + ** matter if it still contains some garbage entries.
1.2284 + */
1.2285 + Pgno iFreePg;
1.2286 + MemPage *pFreePg;
1.2287 + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1); /* Asks for page iLastPg itself; checked by the assert below */
1.2288 + if( rc!=SQLITE_OK ){
1.2289 + return rc;
1.2290 + }
1.2291 + assert( iFreePg==iLastPg );
1.2292 + releasePage(pFreePg);
1.2293 + }
1.2294 + } else {
1.2295 + Pgno iFreePg; /* Index of free page to move pLastPg to */
1.2296 + MemPage *pLastPg;
1.2297 +
1.2298 + rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
1.2299 + if( rc!=SQLITE_OK ){
1.2300 + return rc;
1.2301 + }
1.2302 +
1.2303 + /* If nFin is zero, this loop runs exactly once and page pLastPg
1.2304 + ** is swapped with the first free page pulled off the free list.
1.2305 + **
1.2306 + ** On the other hand, if nFin is greater than zero, then keep
1.2307 + ** looping until a free-page located within the first nFin pages
1.2308 + ** of the file is found.
1.2309 + */
1.2310 + do {
1.2311 + MemPage *pFreePg;
1.2312 + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
1.2313 + if( rc!=SQLITE_OK ){
1.2314 + releasePage(pLastPg);
1.2315 + return rc;
1.2316 + }
1.2317 + releasePage(pFreePg); /* Only the page number iFreePg is used after this point */
1.2318 + }while( nFin!=0 && iFreePg>nFin );
1.2319 + assert( iFreePg<iLastPg );
1.2320 +
1.2321 + rc = sqlite3PagerWrite(pLastPg->pDbPage);
1.2322 + if( rc==SQLITE_OK ){
1.2323 + rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0); /* isCommit is set whenever nFin is nonzero */
1.2324 + }
1.2325 + releasePage(pLastPg);
1.2326 + if( rc!=SQLITE_OK ){
1.2327 + return rc;
1.2328 + }
1.2329 + }
1.2330 + }
1.2331 +
1.2332 + pBt->nTrunc = iLastPg - 1; /* Next step starts from the page before iLastPg */
1.2333 + while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
1.2334 + pBt->nTrunc--; /* Skip back over the pending-byte page and pointer-map pages */
1.2335 + }
1.2336 + return SQLITE_OK;
1.2337 +}
1.2338 +
1.2339 +/*
1.2340 +** A write-transaction must be opened before calling this function.
1.2341 +** It performs a single unit of work towards an incremental vacuum.
1.2342 +**
1.2343 +** If the incremental vacuum is finished after this function has run,
1.2344 +** SQLITE_DONE is returned. If it is not finished, but no error occurred,
1.2345 +** SQLITE_OK is returned. Otherwise an SQLite error code.
1.2346 +*/
1.2347 +int sqlite3BtreeIncrVacuum(Btree *p){
1.2348 + BtShared *pShared = p->pBt; /* Shared btree behind handle p */
1.2349 + int rc = SQLITE_DONE; /* Result when auto-vacuum is off */
1.2350 +
1.2351 + sqlite3BtreeEnter(p);
1.2352 + pShared->db = p->db;
1.2353 + assert( pShared->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
1.2354 + if( pShared->autoVacuum ){
1.2355 + /* One step only, with nFin==0 (no target page count). */
1.2356 + invalidateAllOverflowCache(pShared);
1.2357 + rc = incrVacuumStep(pShared, 0);
1.2358 + }
1.2359 +
1.2360 + sqlite3BtreeLeave(p);
1.2361 + return rc;
1.2362 +}
1.2363 +
1.2364 +/*
1.2365 +** This routine is called prior to sqlite3PagerCommitPhaseOne when a
1.2366 +** transaction is committed for an auto-vacuum database.
1.2367 +**
1.2368 +** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
1.2369 +** the database file should be truncated to during the commit process.
1.2370 +** i.e. the database has been reorganized so that only the first *pnTrunc
1.2371 +** pages are in use. SQLITE_CORRUPT_BKPT is returned, before any page is
1.2372 +** moved, when the page count and free-page count are inconsistent. */
1.2373 +static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
1.2374 + int rc = SQLITE_OK;
1.2375 + Pager *pPager = pBt->pPager;
1.2376 + VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
1.2377 +
1.2378 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2379 + invalidateAllOverflowCache(pBt);
1.2380 + assert(pBt->autoVacuum);
1.2381 + if( !pBt->incrVacuum ){
1.2382 + Pgno nFin = 0; /* Target page count; stays 0 when pBt->nTrunc is already set */
1.2383 +
1.2384 + if( pBt->nTrunc==0 ){
1.2385 + Pgno nFree; /* Free-page count read from offset 36 of page 1 */
1.2386 + Pgno nPtrmap; /* Pointer-map pages that become unnecessary */
1.2387 + const int pgsz = pBt->pageSize;
1.2388 + int nOrig = pagerPagecount(pBt->pPager); /* Page count before vacuuming */
1.2389 +
1.2390 + if( PTRMAP_ISPAGE(pBt, nOrig) ){
1.2391 + return SQLITE_CORRUPT_BKPT;
1.2392 + }
1.2393 + if( nOrig==PENDING_BYTE_PAGE(pBt) ){
1.2394 + nOrig--;
1.2395 + }
1.2396 + nFree = get4byte(&pBt->pPage1->aData[36]);
1.2397 + nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
1.2398 + nFin = nOrig - nFree - nPtrmap;
1.2399 + if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
1.2400 + nFin--;
1.2401 + }
1.2402 + while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ) nFin--;
1.2403 + /* nFin is unsigned: a free-page count that is too large wraps it past nOrig. */
1.2404 + if( nFin>(Pgno)nOrig ) return SQLITE_CORRUPT_BKPT;
1.2405 + }
1.2406 +
1.2407 + while( rc==SQLITE_OK ){ /* Repeat single steps until SQLITE_DONE or an error */
1.2408 + rc = incrVacuumStep(pBt, nFin);
1.2409 + }
1.2410 + if( rc==SQLITE_DONE ){
1.2411 + assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
1.2412 + rc = SQLITE_OK;
1.2413 + if( pBt->nTrunc && nFin ){
1.2414 + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
1.2415 + put4byte(&pBt->pPage1->aData[32], 0); /* Clear the 4-byte fields at offsets 32 and 36 of page 1 */
1.2416 + put4byte(&pBt->pPage1->aData[36], 0);
1.2417 + pBt->nTrunc = nFin;
1.2418 + }
1.2419 + }
1.2420 + if( rc!=SQLITE_OK ){
1.2421 + sqlite3PagerRollback(pPager);
1.2422 + }
1.2423 + }
1.2424 +
1.2425 + if( rc==SQLITE_OK ){
1.2426 + *pnTrunc = pBt->nTrunc;
1.2427 + pBt->nTrunc = 0; /* Consumed: the value now belongs to the caller */
1.2428 + }
1.2429 + assert( nRef==sqlite3PagerRefcount(pPager) );
1.2430 + return rc;
1.2431 +}
1.2432 +
1.2433 +#endif
1.2434 +
1.2435 +/*
1.2436 +** This routine does the first phase of a two-phase commit. This routine
1.2437 +** causes a rollback journal to be created (if it does not already exist)
1.2438 +** and populated with enough information so that if a power loss occurs
1.2439 +** the database can be restored to its original state by playing back
1.2440 +** the journal. Then the contents of the journal are flushed out to
1.2441 +** the disk. After the journal is safely on oxide, the changes to the
1.2442 +** database are written into the database file and flushed to oxide.
1.2443 +** At the end of this call, the rollback journal still exists on the
1.2444 +** disk and we are still holding all locks, so the transaction has not
1.2445 +** committed. See sqlite3BtreeCommitPhaseTwo() for the second phase of the
1.2446 +** commit process.
1.2447 +**
1.2448 +** This call is a no-op if no write-transaction is currently active on pBt.
1.2449 +**
1.2450 +** Otherwise, sync the database file for the btree pBt. zMaster points to
1.2451 +** the name of a master journal file that should be written into the
1.2452 +** individual journal file, or is NULL, indicating no master journal file
1.2453 +** (single database transaction).
1.2454 +**
1.2455 +** When this is called, the master journal should already have been
1.2456 +** created, populated with this journal pointer and synced to disk.
1.2457 +**
1.2458 +** Once this routine has returned, the only thing required to commit
1.2459 +** the write-transaction for this database file is to delete the journal.
1.2460 +*/
1.2461 +int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
1.2462 + BtShared *pShared; /* Shared btree behind handle p */
1.2463 + Pgno nFinal = 0; /* Truncation size; 0 unless autoVacuumCommit() sets it */
1.2464 + int rc = SQLITE_OK; /* Return code */
1.2465 +
1.2466 + /* Without a write-transaction on this handle there is nothing to do. */
1.2467 + if( p->inTrans!=TRANS_WRITE ) return SQLITE_OK;
1.2468 + pShared = p->pBt;
1.2469 + sqlite3BtreeEnter(p);
1.2470 + pShared->db = p->db;
1.2471 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2472 + if( pShared->autoVacuum ){
1.2473 + rc = autoVacuumCommit(pShared, &nFinal);
1.2474 + }
1.2475 +#endif
1.2476 + if( rc==SQLITE_OK ){
1.2477 + rc = sqlite3PagerCommitPhaseOne(pShared->pPager, zMaster, nFinal, 0);
1.2478 + }
1.2479 + sqlite3BtreeLeave(p);
1.2480 + return rc;
1.2481 +}
1.2482 +
1.2483 +/*
1.2484 +** Commit the transaction currently in progress.
1.2485 +**
1.2486 +** This routine implements the second phase of a 2-phase commit. The
1.2487 +** sqlite3BtreeCommitPhaseOne() routine does the first phase and should be
1.2488 +** invoked prior to calling this routine. sqlite3BtreeCommitPhaseOne() did
1.2489 +** all the work of writing information out to disk and flushing the
1.2490 +** contents so that they are written onto the disk platter. All this
1.2491 +** routine has to do is delete or truncate the rollback journal
1.2492 +** (which causes the transaction to commit) and drop locks.
1.2493 +**
1.2494 +** This will release the write lock on the database file. If there
1.2495 +** are no active cursors, it also releases the read lock.
1.2496 +*/
1.2497 +int sqlite3BtreeCommitPhaseTwo(Btree *p){
1.2498 + BtShared *pBt = p->pBt;
1.2499 +
1.2500 + sqlite3BtreeEnter(p);
1.2501 + pBt->db = p->db;
1.2502 + btreeIntegrity(p);
1.2503 +
1.2504 + /* If the handle has a write-transaction open, commit the shared-btrees
1.2505 + ** transaction and set the shared state to TRANS_READ.
1.2506 + */
1.2507 + if( p->inTrans==TRANS_WRITE ){
1.2508 + int rc;
1.2509 + assert( pBt->inTransaction==TRANS_WRITE );
1.2510 + assert( pBt->nTransaction>0 );
1.2511 + rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
1.2512 + if( rc!=SQLITE_OK ){ /* On failure p->inTrans and the shared counters are left untouched */
1.2513 + sqlite3BtreeLeave(p);
1.2514 + return rc;
1.2515 + }
1.2516 + pBt->inTransaction = TRANS_READ;
1.2517 + pBt->inStmt = 0;
1.2518 + }
1.2519 + unlockAllTables(p);
1.2520 +
1.2521 + /* If the handle has any kind of transaction open, decrement the transaction
1.2522 + ** count of the shared btree. If the transaction count reaches 0, set
1.2523 + ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
1.2524 + ** will unlock the pager.
1.2525 + */
1.2526 + if( p->inTrans!=TRANS_NONE ){
1.2527 + pBt->nTransaction--;
1.2528 + if( 0==pBt->nTransaction ){
1.2529 + pBt->inTransaction = TRANS_NONE;
1.2530 + }
1.2531 + }
1.2532 +
1.2533 + /* Set the handles current transaction state to TRANS_NONE and unlock
1.2534 + ** the pager if this call closed the only read or write transaction.
1.2535 + */
1.2536 + p->inTrans = TRANS_NONE;
1.2537 + unlockBtreeIfUnused(pBt); /* A no-op while cursors or other transactions remain */
1.2538 +
1.2539 + btreeIntegrity(p);
1.2540 + sqlite3BtreeLeave(p);
1.2541 + return SQLITE_OK;
1.2542 +}
1.2543 +
1.2544 +/*
1.2545 +** Do both phases of a commit.
1.2546 +*/
1.2547 +int sqlite3BtreeCommit(Btree *p){
1.2548 + int rcCommit; /* Result of whichever phase ran last */
1.2549 +
1.2550 + sqlite3BtreeEnter(p);
1.2551 + rcCommit = sqlite3BtreeCommitPhaseOne(p, 0); /* 0: no master journal name */
1.2552 + /* Phase two is attempted only after phase one has succeeded. */
1.2553 + if( rcCommit==SQLITE_OK ) rcCommit = sqlite3BtreeCommitPhaseTwo(p);
1.2554 + sqlite3BtreeLeave(p);
1.2555 + return rcCommit;
1.2556 +}
1.2557 +
1.2558 +#ifndef NDEBUG
1.2559 +/*
1.2560 +** Return the number of write-cursors open on this handle. This is for use
1.2561 +** in assert() expressions, so it is only compiled if NDEBUG is not
1.2562 +** defined.
1.2563 +**
1.2564 +** For the purposes of this routine, a write-cursor is any cursor that
1.2565 +** is capable of writing to the database. That means the cursor was
1.2566 +** originally opened for writing and the cursor has not been disabled
1.2567 +** by having its state changed to CURSOR_FAULT.
1.2568 +*/
1.2569 +static int countWriteCursors(BtShared *pBt){
1.2570 + int nWriter = 0; /* Write-cursors counted so far */
1.2571 + BtCursor *pIter; /* Current entry of the pBt->pCursor list */
1.2572 + for(pIter=pBt->pCursor; pIter!=0; pIter=pIter->pNext){
1.2573 + nWriter += (pIter->wrFlag && pIter->eState!=CURSOR_FAULT);
1.2574 + }
1.2575 + return nWriter;
1.2576 +}
1.2577 +#endif
1.2578 +
1.2579 +/*
1.2580 +** This routine sets the state to CURSOR_FAULT and the error
1.2581 +** code to errCode for every cursor on BtShared that pBtree
1.2582 +** references.
1.2583 +**
1.2584 +** Every cursor is tripped, including cursors that belong
1.2585 +** to other database connections that happen to be sharing
1.2586 +** the cache with pBtree.
1.2587 +**
1.2588 +** This routine gets called when a rollback occurs.
1.2589 +** All cursors using the same cache must be tripped
1.2590 +** to prevent them from trying to use the btree after
1.2591 +** the rollback. The rollback may have deleted tables
1.2592 +** or moved root pages, so it is not sufficient to
1.2593 +** save the state of the cursor. The cursor must be
1.2594 +** invalidated.
1.2595 +*/
1.2596 +void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
1.2597 + BtCursor *pCur; /* Cursor currently being tripped */
1.2598 + sqlite3BtreeEnter(pBtree);
1.2599 + for(pCur=pBtree->pBt->pCursor; pCur!=0; pCur=pCur->pNext){
1.2600 + /* clearCursorPosition() runs before the state is overwritten. */
1.2601 + clearCursorPosition(pCur);
1.2602 + pCur->eState = CURSOR_FAULT; pCur->skip = errCode;
1.2603 + }
1.2604 + sqlite3BtreeLeave(pBtree);
1.2605 +}
1.2606 +
1.2607 +/*
1.2608 +** Rollback the transaction in progress. All cursors will be
1.2609 +** invalidated by this operation. Any attempt to use a cursor
1.2610 +** that was open at the beginning of this operation will result
1.2611 +** in an error.
1.2612 +**
1.2613 +** This will release the write lock on the database file. If there
1.2614 +** are no active cursors, it also releases the read lock.
1.2615 +*/
1.2616 +int sqlite3BtreeRollback(Btree *p){
1.2617 + int rc;
1.2618 + BtShared *pBt = p->pBt;
1.2619 + MemPage *pPage1;
1.2620 +
1.2621 + sqlite3BtreeEnter(p);
1.2622 + pBt->db = p->db;
1.2623 + rc = saveAllCursors(pBt, 0, 0);
1.2624 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.2625 + if( rc!=SQLITE_OK ){
1.2626 + /* This is a horrible situation. An IO or malloc() error occurred whilst
1.2627 + ** trying to save cursor positions. If this is an automatic rollback (as
1.2628 + ** the result of a constraint, malloc() failure or IO error) then
1.2629 + ** the cache may be internally inconsistent (not contain valid trees) so
1.2630 + ** we cannot simply return the error to the caller. Instead, abort
1.2631 + ** all queries that may be using any of the cursors that failed to save.
1.2632 + */
1.2633 + sqlite3BtreeTripAllCursors(p, rc);
1.2634 + }
1.2635 +#endif
1.2636 + btreeIntegrity(p);
1.2637 + unlockAllTables(p);
1.2638 +
1.2639 + if( p->inTrans==TRANS_WRITE ){
1.2640 + int rc2; /* Result of the pager rollback; replaces rc only when it is an error */
1.2641 +
1.2642 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2643 + pBt->nTrunc = 0; /* Discard any truncation point recorded by incrVacuumStep() */
1.2644 +#endif
1.2645 +
1.2646 + assert( TRANS_WRITE==pBt->inTransaction );
1.2647 + rc2 = sqlite3PagerRollback(pBt->pPager);
1.2648 + if( rc2!=SQLITE_OK ){
1.2649 + rc = rc2;
1.2650 + }
1.2651 +
1.2652 + /* The rollback may have destroyed the pPage1->aData value. So
1.2653 + ** call sqlite3BtreeGetPage() on page 1 again to make
1.2654 + ** sure pPage1->aData is set correctly. */
1.2655 + if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
1.2656 + releasePage(pPage1); /* The extra reference is dropped again immediately */
1.2657 + }
1.2658 + assert( countWriteCursors(pBt)==0 );
1.2659 + pBt->inTransaction = TRANS_READ;
1.2660 + }
1.2661 +
1.2662 + if( p->inTrans!=TRANS_NONE ){
1.2663 + assert( pBt->nTransaction>0 );
1.2664 + pBt->nTransaction--;
1.2665 + if( 0==pBt->nTransaction ){
1.2666 + pBt->inTransaction = TRANS_NONE;
1.2667 + }
1.2668 + }
1.2669 +
1.2670 + p->inTrans = TRANS_NONE;
1.2671 + pBt->inStmt = 0;
1.2672 + unlockBtreeIfUnused(pBt); /* A no-op while cursors or other transactions remain */
1.2673 +
1.2674 + btreeIntegrity(p);
1.2675 + sqlite3BtreeLeave(p);
1.2676 + return rc;
1.2677 +}
1.2678 +
1.2679 +/*
1.2680 +** Start a statement subtransaction. The subtransaction
1.2681 +** can be rolled back independently of the main transaction.
1.2682 +** You must start a transaction before starting a subtransaction.
1.2683 +** The subtransaction is ended automatically if the main transaction
1.2684 +** commits or rolls back.
1.2685 +**
1.2686 +** Only one subtransaction may be active at a time. It is an error to try
1.2687 +** to start a new subtransaction if another subtransaction is already active.
1.2688 +**
1.2689 +** Statement subtransactions are used around individual SQL statements
1.2690 +** that are contained within a BEGIN...COMMIT block. If a constraint
1.2691 +** error occurs within the statement, the effect of that one statement
1.2692 +** can be rolled back without having to rollback the entire transaction.
1.2693 +*/
1.2694 +int sqlite3BtreeBeginStmt(Btree *p){
1.2695 + int rc; /* Return code */
1.2696 + BtShared *pBt = p->pBt;
1.2697 + sqlite3BtreeEnter(p);
1.2698 + pBt->db = p->db;
1.2699 + if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){ /* No write transaction, or a statement is already open */
1.2700 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.2701 + }else{
1.2702 + assert( pBt->inTransaction==TRANS_WRITE );
1.2703 + rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager); /* Pager is not called for a read-only btree */
1.2704 + pBt->inStmt = 1; /* Set even when sqlite3PagerStmtBegin() returned an error */
1.2705 + }
1.2706 + sqlite3BtreeLeave(p);
1.2707 + return rc;
1.2708 +}
1.2709 +
1.2710 +
1.2711 +/*
1.2712 +** Commit the statement subtransaction currently in progress. If no
1.2713 +** subtransaction is active, this is a no-op.
1.2714 +*/
1.2715 +int sqlite3BtreeCommitStmt(Btree *p){
1.2716 + int rc; /* Return code: SQLITE_OK or the result of sqlite3PagerStmtCommit() */
1.2717 + BtShared *pBt = p->pBt;
1.2718 + sqlite3BtreeEnter(p);
1.2719 + pBt->db = p->db;
1.2720 + if( pBt->inStmt && !pBt->readOnly ){
1.2721 + rc = sqlite3PagerStmtCommit(pBt->pPager);
1.2722 + }else{
1.2723 + rc = SQLITE_OK; /* Nothing to commit at the pager level */
1.2724 + }
1.2725 + pBt->inStmt = 0; /* Cleared on every path, including a failed pager commit */
1.2726 + sqlite3BtreeLeave(p);
1.2727 + return rc;
1.2728 +}
1.2729 +
1.2730 +/*
1.2731 +** Rollback the active statement subtransaction. If no subtransaction
1.2732 +** is active this routine is a no-op.
1.2733 +**
1.2734 +** All cursors will be invalidated by this operation. Any attempt
1.2735 +** to use a cursor that was open at the beginning of this operation
1.2736 +** will result in an error.
1.2737 +*/
1.2738 +int sqlite3BtreeRollbackStmt(Btree *p){
1.2739 + int rc = SQLITE_OK; /* Stays SQLITE_OK when there is nothing to roll back */
1.2740 + BtShared *pBt = p->pBt;
1.2741 + sqlite3BtreeEnter(p);
1.2742 + pBt->db = p->db;
1.2743 + if( pBt->inStmt && !pBt->readOnly ){
1.2744 + rc = sqlite3PagerStmtRollback(pBt->pPager);
1.2745 + pBt->inStmt = 0; /* Unlike sqlite3BtreeCommitStmt(), cleared only inside this branch */
1.2746 + }
1.2747 + sqlite3BtreeLeave(p);
1.2748 + return rc;
1.2749 +}
1.2750 +
1.2751 +/*
1.2752 +** Create a new cursor for the BTree whose root is on the page
1.2753 +** iTable. The act of acquiring a cursor gets a read lock on
1.2754 +** the database file.
1.2755 +**
1.2756 +** If wrFlag==0, then the cursor can only be used for reading.
1.2757 +** If wrFlag==1, then the cursor can be used for reading or for
1.2758 +** writing if other conditions for writing are also met. These
1.2759 +** are the conditions that must be met in order for writing to
1.2760 +** be allowed:
1.2761 +**
1.2762 +** 1: The cursor must have been opened with wrFlag==1
1.2763 +**
1.2764 +** 2: Other database connections that share the same pager cache
1.2765 +** but which are not in the READ_UNCOMMITTED state may not have
1.2766 +** cursors open with wrFlag==0 on the same table. Otherwise
1.2767 +** the changes made by this write cursor would be visible to
1.2768 +** the read cursors in the other database connection.
1.2769 +**
1.2770 +** 3: The database must be writable (not on read-only media)
1.2771 +**
1.2772 +** 4: There must be an active transaction.
1.2773 +**
1.2774 +** No checking is done to make sure that page iTable really is the
1.2775 +** root page of a b-tree. If it is not, then the cursor acquired
1.2776 +** will not work correctly.
1.2777 +**
1.2778 +** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
1.2779 +** pointed to by pCur have been zeroed by the caller.
1.2780 +*/
1.2781 +static int btreeCursor(
1.2782 + Btree *p, /* The btree */
1.2783 + int iTable, /* Root page of table to open */
1.2784 + int wrFlag, /* 1 to write. 0 read-only */
1.2785 + struct KeyInfo *pKeyInfo, /* First arg to comparison function */
1.2786 + BtCursor *pCur /* Space for new cursor */
1.2787 +){
1.2788 + int rc; /* Return code */
1.2789 + BtShared *pBt = p->pBt;
1.2790 +
1.2791 + assert( sqlite3BtreeHoldsMutex(p) );
1.2792 + if( wrFlag ){ /* Extra checks that apply to write cursors only */
1.2793 + if( pBt->readOnly ){
1.2794 + return SQLITE_READONLY;
1.2795 + }
1.2796 + if( checkReadLocks(p, iTable, 0, 0) ){
1.2797 + return SQLITE_LOCKED;
1.2798 + }
1.2799 + }
1.2800 +
1.2801 + if( pBt->pPage1==0 ){ /* Page 1 is not loaded yet: lock the btree first */
1.2802 + rc = lockBtreeWithRetry(p);
1.2803 + if( rc!=SQLITE_OK ){
1.2804 + return rc;
1.2805 + }
1.2806 + if( pBt->readOnly && wrFlag ){ /* readOnly is tested again after the lock is obtained */
1.2807 + return SQLITE_READONLY; /* NOTE(review): this return skips unlockBtreeIfUnused() - confirm that is intended */
1.2808 + }
1.2809 + }
1.2810 + pCur->pgnoRoot = (Pgno)iTable;
1.2811 + if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){ /* Root is page 1 but the pager reports zero pages */
1.2812 + rc = SQLITE_EMPTY;
1.2813 + goto create_cursor_exception;
1.2814 + }
1.2815 + rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
1.2816 + if( rc!=SQLITE_OK ){
1.2817 + goto create_cursor_exception;
1.2818 + }
1.2819 +
1.2820 + /* Now that no other errors can occur, finish filling in the BtCursor
1.2821 + ** variables and link the cursor at the head of the BtShared cursor
1.2822 + ** list. The new cursor starts out in the CURSOR_INVALID state.
1.2823 + */
1.2824 + pCur->pKeyInfo = pKeyInfo;
1.2825 + pCur->pBtree = p;
1.2826 + pCur->pBt = pBt;
1.2827 + pCur->wrFlag = wrFlag;
1.2828 + pCur->pNext = pBt->pCursor;
1.2829 + if( pCur->pNext ){
1.2830 + pCur->pNext->pPrev = pCur;
1.2831 + }
1.2832 + pBt->pCursor = pCur;
1.2833 + pCur->eState = CURSOR_INVALID;
1.2834 +
1.2835 + return SQLITE_OK;
1.2836 +
1.2837 +create_cursor_exception:
1.2838 + releasePage(pCur->apPage[0]); /* NOTE(review): apPage[0] may still be zero here (caller-zeroed memory) - confirm releasePage() accepts that */
1.2839 + unlockBtreeIfUnused(pBt);
1.2840 + return rc;
1.2841 +}
1.2842 +int sqlite3BtreeCursor( /* Mutex-holding wrapper around btreeCursor() */
1.2843 + Btree *p, /* The btree */
1.2844 + int iTable, /* Root page of table to open */
1.2845 + int wrFlag, /* 1 to write. 0 read-only */
1.2846 + struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
1.2847 + BtCursor *pCur /* Write new cursor here */
1.2848 +){
1.2849 + int rc; /* Result of btreeCursor(), returned unchanged */
1.2850 + sqlite3BtreeEnter(p);
1.2851 + p->pBt->db = p->db;
1.2852 + rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
1.2853 + sqlite3BtreeLeave(p);
1.2854 + return rc;
1.2855 +}
1.2856 +int sqlite3BtreeCursorSize(void){ /* Bytes of memory needed to hold one BtCursor; takes no arguments */
1.2857 + return sizeof(BtCursor);
1.2858 +}
1.2859 +
1.2860 +
1.2861 +
1.2862 +/*
1.2863 +** Close a cursor. The read lock on the database file is released
1.2864 +** when the last cursor is closed.
1.2865 +*/
1.2866 +int sqlite3BtreeCloseCursor(BtCursor *pCur){
1.2867 + Btree *pBtree = pCur->pBtree;
1.2868 + if( pBtree ){ /* btreeCursor() assigns pBtree only on its success path */
1.2869 + int i; /* Index into pCur->apPage[] */
1.2870 + BtShared *pBt = pCur->pBt;
1.2871 + sqlite3BtreeEnter(pBtree);
1.2872 + pBt->db = pBtree->db;
1.2873 + clearCursorPosition(pCur);
1.2874 + if( pCur->pPrev ){ /* Unlink pCur from the doubly-linked cursor list */
1.2875 + pCur->pPrev->pNext = pCur->pNext;
1.2876 + }else{
1.2877 + pBt->pCursor = pCur->pNext; /* pCur was the head of the list */
1.2878 + }
1.2879 + if( pCur->pNext ){
1.2880 + pCur->pNext->pPrev = pCur->pPrev;
1.2881 + }
1.2882 + for(i=0; i<=pCur->iPage; i++){ /* Release every page from the root down to the current one */
1.2883 + releasePage(pCur->apPage[i]);
1.2884 + }
1.2885 + unlockBtreeIfUnused(pBt);
1.2886 + invalidateOverflowCache(pCur);
1.2887 + /* sqlite3_free(pCur); -- disabled: the cursor memory is supplied by the caller of sqlite3BtreeCursor() */
1.2888 + sqlite3BtreeLeave(pBtree);
1.2889 + }
1.2890 + return SQLITE_OK; /* This routine reports no errors */
1.2891 +}
1.2892 +
1.2893 +/*
1.2894 +** Make a temporary cursor by filling in the fields of pTempCur.
1.2895 +** The temporary cursor is not on the cursor list for the Btree.
1.2896 +*/
1.2897 +void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1.2898 + int i; /* Index into pTempCur->apPage[] */
1.2899 + assert( cursorHoldsMutex(pCur) );
1.2900 + memcpy(pTempCur, pCur, sizeof(BtCursor)); /* Start from a byte-for-byte copy of pCur */
1.2901 + pTempCur->pNext = 0; /* The copy is not linked into any cursor list */
1.2902 + pTempCur->pPrev = 0;
1.2903 + for(i=0; i<=pTempCur->iPage; i++){ /* Take an extra pager reference on each page the copy points at */
1.2904 + sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
1.2905 + }
1.2906 + assert( pCur->pKey==0 );
1.2907 +}
1.2908 +
1.2909 +/*
1.2910 +** Delete a temporary cursor such as was made by the
1.2911 +** sqlite3BtreeGetTempCursor() function above.
1.2912 +*/
1.2913 +void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
1.2914 + int i; /* Index into pCur->apPage[] */
1.2915 + assert( cursorHoldsMutex(pCur) );
1.2916 + for(i=0; i<=pCur->iPage; i++){ /* Drop one pager reference per page, mirroring sqlite3BtreeGetTempCursor() */
1.2917 + sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
1.2918 + }
1.2919 + sqlite3_free(pCur->pKey); /* pCur itself is not freed here */
1.2920 +}
1.2921 +
1.2922 +/*
1.2923 +** Make sure the BtCursor* given in the argument has a valid
1.2924 +** BtCursor.info structure. If it is not already valid, call
1.2925 +** sqlite3BtreeParseCell() to fill it in.
1.2926 +**
1.2927 +** BtCursor.info is a cache of the information in the current cell.
1.2928 +** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
1.2929 +**
1.2930 +** 2007-06-25: There is a bug in some versions of MSVC that cause the
1.2931 +** compiler to crash when getCellInfo() is implemented as a macro.
1.2932 +** But there is a measurable speed advantage to using the macro on gcc
1.2933 +** (when fewer compiler optimizations like -Os or -O0 are used and the
1.2934 +** compiler is not doing aggressive inlining.) So we use a real function
1.2935 +** for MSVC and a macro for everything else. Ticket #2457.
1.2936 +*/
1.2937 +#ifndef NDEBUG
1.2938 + static void assertCellInfo(BtCursor *pCur){ /* Debug check: cached pCur->info must equal a fresh parse of the current cell */
1.2939 + CellInfo info;
1.2940 + int iPage = pCur->iPage;
1.2941 + memset(&info, 0, sizeof(info)); /* Zero first so the whole-struct memcmp() below is meaningful */
1.2942 + sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
1.2943 + assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
1.2944 + }
1.2945 +#else
1.2946 + #define assertCellInfo(x)
1.2947 +#endif
1.2948 +#ifdef _MSC_VER
1.2949 + /* Use a real function in MSVC to work around bugs in that compiler. info.nSize==0 marks the cache as empty. */
1.2950 + static void getCellInfo(BtCursor *pCur){
1.2951 + if( pCur->info.nSize==0 ){
1.2952 + int iPage = pCur->iPage;
1.2953 + sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
1.2954 + pCur->validNKey = 1;
1.2955 + }else{
1.2956 + assertCellInfo(pCur);
1.2957 + }
1.2958 + }
1.2959 +#else /* if not _MSC_VER */
1.2960 + /* Use a macro in all other compilers so that the function is inlined. NOTE: it expands to a bare if/else statement and uses its argument unparenthesized, so invoke it only as a standalone statement with a simple pointer name. */
1.2961 +#define getCellInfo(pCur) \
1.2962 + if( pCur->info.nSize==0 ){ \
1.2963 + int iPage = pCur->iPage; \
1.2964 + sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
1.2965 + pCur->validNKey = 1; \
1.2966 + }else{ \
1.2967 + assertCellInfo(pCur); \
1.2968 + }
1.2969 +#endif /* _MSC_VER */
1.2970 +
1.2971 +/*
1.2972 +** Set *pSize to the size of the buffer needed to hold the value of
1.2973 +** the key for the current entry. If the cursor is not pointing
1.2974 +** to a valid entry, *pSize is set to 0.
1.2975 +**
1.2976 +** For a table with the INTKEY flag set, this routine returns the key
1.2977 +** itself, not the number of bytes in the key.
1.2978 +*/
1.2979 +int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
1.2980 + int rc; /* Result of restoreCursorPosition(), returned unchanged */
1.2981 +
1.2982 + assert( cursorHoldsMutex(pCur) );
1.2983 + rc = restoreCursorPosition(pCur);
1.2984 + if( rc==SQLITE_OK ){ /* On error *pSize is left unwritten */
1.2985 + assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
1.2986 + if( pCur->eState==CURSOR_INVALID ){
1.2987 + *pSize = 0;
1.2988 + }else{
1.2989 + getCellInfo(pCur); /* Make sure pCur->info describes the current cell */
1.2990 + *pSize = pCur->info.nKey;
1.2991 + }
1.2992 + }
1.2993 + return rc;
1.2994 +}
1.2995 +
1.2996 +/*
1.2997 +** Set *pSize to the number of bytes of data in the entry the
1.2998 +** cursor currently points to. Return SQLITE_OK on success, or the
1.2999 +** error code from restoreCursorPosition() if the cursor position
1.3000 +** cannot be restored. If the cursor is not currently pointing to
1.3001 +** an entry (for example, if the database is empty) *pSize is set to 0.
1.3002 +*/
1.3003 +int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
1.3004 + int rc; /* Result of restoreCursorPosition(), returned unchanged */
1.3005 +
1.3006 + assert( cursorHoldsMutex(pCur) );
1.3007 + rc = restoreCursorPosition(pCur);
1.3008 + if( rc==SQLITE_OK ){ /* On error *pSize is left unwritten */
1.3009 + assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
1.3010 + if( pCur->eState==CURSOR_INVALID ){
1.3011 + /* Not pointing at a valid entry - set *pSize to 0. */
1.3012 + *pSize = 0;
1.3013 + }else{
1.3014 + getCellInfo(pCur); /* Make sure pCur->info describes the current cell */
1.3015 + *pSize = pCur->info.nData;
1.3016 + }
1.3017 + }
1.3018 + return rc;
1.3019 +}
1.3020 +
1.3021 +/*
1.3022 +** Given the page number of an overflow page in the database (parameter
1.3023 +** ovfl), this function finds the page number of the next page in the
1.3024 +** linked list of overflow pages. If possible, it uses the auto-vacuum
1.3025 +** pointer-map data instead of reading the content of page ovfl to do so.
1.3026 +**
1.3027 +** If an error occurs an SQLite error code is returned. Otherwise:
1.3028 +**
1.3029 +** Unless pPgnoNext is NULL, the page number of the next overflow
1.3030 +** page in the linked list is written to *pPgnoNext. If page ovfl
1.3031 +** is the last page in its linked list, *pPgnoNext is set to zero.
1.3032 +**
1.3033 +** If ppPage is not NULL, *ppPage is set to the MemPage* handle
1.3034 +** for page ovfl. The underlying pager page may have been requested
1.3035 +** with the noContent flag set, so the page data accessible via
1.3036 +** this handle may not be trusted.
1.3037 +*/
1.3038 +static int getOverflowPage(
1.3039 + BtShared *pBt, /* Shared btree to read from */
1.3040 + Pgno ovfl, /* Overflow page */
1.3041 + MemPage **ppPage, /* OUT: MemPage handle */
1.3042 + Pgno *pPgnoNext /* OUT: Next overflow page number */
1.3043 +){
1.3044 + Pgno next = 0; /* Next page number; 0 means "not yet known" below */
1.3045 + int rc = SQLITE_OK; /* Initialized so the final return never depends on which branch ran */
1.3046 +
1.3047 + assert( sqlite3_mutex_held(pBt->mutex) );
1.3048 + /* One of these must not be NULL. Otherwise, why call this function? */
1.3049 + assert(ppPage || pPgnoNext);
1.3050 +
1.3051 + /* If pPgnoNext is NULL, then this function is being called to obtain
1.3052 + ** a MemPage* reference only. No page-data is required in this case.
1.3053 + */
1.3054 + if( !pPgnoNext ){
1.3055 + return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
1.3056 + }
1.3057 +
1.3058 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.3059 + /* Try to find the next page in the overflow list using the
1.3060 + ** autovacuum pointer-map pages. Guess that the next page in
1.3061 + ** the overflow list is page number (ovfl+1). If that guess turns
1.3062 + ** out to be wrong, fall back to loading the data of page
1.3063 + ** number ovfl to determine the next page number.
1.3064 + */
1.3065 + if( pBt->autoVacuum ){
1.3066 + Pgno pgno; /* Parent page recorded in the pointer map for iGuess */
1.3067 + Pgno iGuess = ovfl+1;
1.3068 + u8 eType; /* Pointer-map entry type for iGuess */
1.3069 +
1.3070 + while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){ /* Step over pages that cannot be overflow pages */
1.3071 + iGuess++;
1.3072 + }
1.3073 +
1.3074 + if( iGuess<=pagerPagecount(pBt->pPager) ){
1.3075 + rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
1.3076 + if( rc!=SQLITE_OK ){
1.3077 + return rc;
1.3078 + }
1.3079 + if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){ /* The guess is confirmed by the pointer map */
1.3080 + next = iGuess;
1.3081 + }
1.3082 + }
1.3083 + }
1.3084 +#endif
1.3085 +
1.3086 + if( next==0 || ppPage ){ /* Page ovfl must be fetched: for its content, for the handle, or both */
1.3087 + MemPage *pPage = 0;
1.3088 +
1.3089 + rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0); /* Content is not needed when next is already known */
1.3090 + assert(rc==SQLITE_OK || pPage==0);
1.3091 + if( next==0 && rc==SQLITE_OK ){
1.3092 + next = get4byte(pPage->aData); /* Read the next page number from the start of the page data */
1.3093 + }
1.3094 +
1.3095 + if( ppPage ){
1.3096 + *ppPage = pPage;
1.3097 + }else{
1.3098 + releasePage(pPage);
1.3099 + }
1.3100 + }
1.3101 + *pPgnoNext = next; /* Written even when rc is an error code */
1.3102 +
1.3103 + return rc;
1.3104 +}
1.3105 +
1.3106 +/*
1.3107 +** Copy data from a buffer to a page, or from a page to a buffer.
1.3108 +**
1.3109 +** pPayload is a pointer to data stored on database page pDbPage.
1.3110 +** If argument eOp is false, then nByte bytes of data are copied
1.3111 +** from pPayload to the buffer pointed at by pBuf. If eOp is true,
1.3112 +** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
1.3113 +** of data are copied from the buffer pBuf to pPayload.
1.3114 +**
1.3115 +** SQLITE_OK is returned on success, otherwise an error code.
1.3116 +*/
1.3117 +static int copyPayload(
1.3118 + void *pPayload, /* Pointer to page data */
1.3119 + void *pBuf, /* Pointer to buffer */
1.3120 + int nByte, /* Number of bytes to copy */
1.3121 + int eOp, /* 0 -> copy from page, 1 -> copy to page */
1.3122 + DbPage *pDbPage /* Page containing pPayload */
1.3123 +){
1.3124 + if( eOp ){
1.3125 + /* Copy data from buffer to page (a write operation) */
1.3126 + int rc = sqlite3PagerWrite(pDbPage); /* Called before the page content is modified */
1.3127 + if( rc!=SQLITE_OK ){
1.3128 + return rc; /* Nothing has been copied in this case */
1.3129 + }
1.3130 + memcpy(pPayload, pBuf, nByte);
1.3131 + }else{
1.3132 + /* Copy data from page to buffer (a read operation) */
1.3133 + memcpy(pBuf, pPayload, nByte);
1.3134 + }
1.3135 + return SQLITE_OK;
1.3136 +}
1.3137 +
1.3138 +/*
1.3139 +** This function is used to read or overwrite payload information
1.3140 +** for the entry that the pCur cursor is pointing to. If the eOp
1.3141 +** parameter is 0, this is a read operation (data copied into
1.3142 +** buffer pBuf). If it is non-zero, a write (data copied from
1.3143 +** buffer pBuf).
1.3144 +**
1.3145 +** A total of "amt" bytes are read or written beginning at "offset".
1.3146 +** Data is read to or from the buffer pBuf.
1.3147 +**
1.3148 +** This routine does not make a distinction between key and data.
1.3149 +** It just reads or writes bytes from the payload area. Data might
1.3150 +** appear on the main page or be scattered out on multiple overflow
1.3151 +** pages.
1.3152 +**
1.3153 +** If the BtCursor.isIncrblobHandle flag is set, and the current
1.3154 +** cursor entry uses one or more overflow pages, this function
1.3155 +** allocates space for and lazily populates the overflow page-list
1.3156 +** cache array (BtCursor.aOverflow). Subsequent calls use this
1.3157 +** cache to make seeking to the supplied offset more efficient.
1.3158 +**
1.3159 +** Once an overflow page-list cache has been allocated, it may be
1.3160 +** invalidated if some other cursor writes to the same table, or if
1.3161 +** the cursor is moved to a different row. Additionally, in auto-vacuum
1.3162 +** mode, the following events may invalidate an overflow page-list cache.
1.3163 +**
1.3164 +** * An incremental vacuum,
1.3165 +** * A commit in auto_vacuum="full" mode,
1.3166 +** * Creating a table (may require moving an overflow page).
1.3167 +*/
1.3168 +static int accessPayload(
1.3169 + BtCursor *pCur, /* Cursor pointing to entry to read from */
1.3170 + int offset, /* Begin reading this far into payload */
1.3171 + int amt, /* Read this many bytes */
1.3172 + unsigned char *pBuf, /* Write the bytes into this buffer */
1.3173 + int skipKey, /* offset begins at data if this is true */
1.3174 + int eOp /* zero to read. non-zero to write. */
1.3175 +){
1.3176 + unsigned char *aPayload; /* Start of payload in the cell, later start of an overflow page */
1.3177 + int rc = SQLITE_OK;
1.3178 + u32 nKey; /* Key bytes stored in the payload (0 for intKey pages) */
1.3179 + int iIdx = 0; /* Position of the current overflow page within the chain */
1.3180 + MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
1.3181 + BtShared *pBt; /* Btree this cursor belongs to */
1.3182 +
1.3183 + assert( pPage );
1.3184 + assert( pCur->eState==CURSOR_VALID );
1.3185 + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
1.3186 + assert( offset>=0 );
1.3187 + assert( cursorHoldsMutex(pCur) );
1.3188 +
1.3189 + getCellInfo(pCur);
1.3190 + aPayload = pCur->info.pCell + pCur->info.nHeader;
1.3191 + nKey = (pPage->intKey ? 0 : pCur->info.nKey);
1.3192 +
1.3193 + if( skipKey ){
1.3194 + offset += nKey; /* Make offset relative to the start of the whole payload */
1.3195 + }
1.3196 + if( offset+amt > nKey+pCur->info.nData ){ /* NOTE(review): mixes signed int with u32 - confirm amt is never negative */
1.3197 + /* Trying to read or write past the end of the data is an error */
1.3198 + return SQLITE_CORRUPT_BKPT;
1.3199 + }
1.3200 +
1.3201 + /* Check if data must be read/written to/from the btree page itself. */
1.3202 + if( offset<pCur->info.nLocal ){
1.3203 + int a = amt; /* Bytes to transfer on the local page */
1.3204 + if( a+offset>pCur->info.nLocal ){
1.3205 + a = pCur->info.nLocal - offset; /* Clip to what is stored locally */
1.3206 + }
1.3207 + rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
1.3208 + offset = 0; /* Any remaining bytes start at the beginning of the overflow data */
1.3209 + pBuf += a;
1.3210 + amt -= a;
1.3211 + }else{
1.3212 + offset -= pCur->info.nLocal; /* Make offset relative to the start of the overflow data */
1.3213 + }
1.3214 +
1.3215 + pBt = pCur->pBt;
1.3216 + if( rc==SQLITE_OK && amt>0 ){ /* Remaining bytes live on overflow pages */
1.3217 + const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
1.3218 + Pgno nextPage; /* Overflow page to visit next; 0 ends the chain */
1.3219 +
1.3220 + nextPage = get4byte(&aPayload[pCur->info.nLocal]); /* First overflow page number follows the local payload bytes */
1.3221 +
1.3222 +#ifndef SQLITE_OMIT_INCRBLOB
1.3223 + /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
1.3224 + ** has not been allocated, allocate it now. The array is sized at
1.3225 + ** one entry for each overflow page in the overflow chain. The
1.3226 + ** page number of the first overflow page is stored in aOverflow[0],
1.3227 + ** etc. A value of 0 in the aOverflow[] array means "not yet known"
1.3228 + ** (the cache is lazily populated).
1.3229 + */
1.3230 + if( pCur->isIncrblobHandle && !pCur->aOverflow ){
1.3231 + int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize; /* Overflow bytes divided by ovflSize, rounded up */
1.3232 + pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
1.3233 + if( nOvfl && !pCur->aOverflow ){
1.3234 + rc = SQLITE_NOMEM; /* The loop below is skipped because rc is no longer SQLITE_OK */
1.3235 + }
1.3236 + }
1.3237 +
1.3238 + /* If the overflow page-list cache has been allocated and the
1.3239 + ** entry for the first required overflow page is valid, skip
1.3240 + ** directly to it.
1.3241 + */
1.3242 + if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
1.3243 + iIdx = (offset/ovflSize);
1.3244 + nextPage = pCur->aOverflow[iIdx];
1.3245 + offset = (offset%ovflSize); /* Offset within the page that was skipped to */
1.3246 + }
1.3247 +#endif
1.3248 +
1.3249 + for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){ /* One iteration per overflow page visited */
1.3250 +
1.3251 +#ifndef SQLITE_OMIT_INCRBLOB
1.3252 + /* If required, populate the overflow page-list cache. */
1.3253 + if( pCur->aOverflow ){
1.3254 + assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
1.3255 + pCur->aOverflow[iIdx] = nextPage;
1.3256 + }
1.3257 +#endif
1.3258 +
1.3259 + if( offset>=ovflSize ){
1.3260 + /* The only reason to read this page is to obtain the page
1.3261 + ** number for the next page in the overflow chain. The page
1.3262 + ** data is not required. So first try to lookup the overflow
1.3263 + ** page-list cache, if any, then fall back to the getOverflowPage()
1.3264 + ** function.
1.3265 + */
1.3266 +#ifndef SQLITE_OMIT_INCRBLOB
1.3267 + if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
1.3268 + nextPage = pCur->aOverflow[iIdx+1];
1.3269 + } else
1.3270 +#endif
1.3271 + rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
1.3272 + offset -= ovflSize; /* This whole page is skipped */
1.3273 + }else{
1.3274 + /* Need to read this page properly. It contains some of the
1.3275 + ** range of data that is being read (eOp==0) or written (eOp!=0).
1.3276 + */
1.3277 + DbPage *pDbPage;
1.3278 + int a = amt; /* Bytes to transfer on this overflow page */
1.3279 + rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
1.3280 + if( rc==SQLITE_OK ){
1.3281 + aPayload = sqlite3PagerGetData(pDbPage);
1.3282 + nextPage = get4byte(aPayload); /* First 4 bytes of the page hold the next page number */
1.3283 + if( a + offset > ovflSize ){
1.3284 + a = ovflSize - offset; /* Clip to the content that fits on this page */
1.3285 + }
1.3286 + rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage); /* +4 steps over the next-page number */
1.3287 + sqlite3PagerUnref(pDbPage);
1.3288 + offset = 0;
1.3289 + amt -= a;
1.3290 + pBuf += a;
1.3291 + }
1.3292 + }
1.3293 + }
1.3294 + }
1.3295 +
1.3296 + if( rc==SQLITE_OK && amt>0 ){ /* The chain ended before all requested bytes were transferred */
1.3297 + return SQLITE_CORRUPT_BKPT;
1.3298 + }
1.3299 + return rc;
1.3300 +}
1.3301 +
1.3302 +/*
1.3303 +** Read part of the key associated with cursor pCur. Exactly
1.3304 +** "amt" bytes will be transferred into pBuf[]. The transfer
1.3305 +** begins at "offset".
1.3306 +**
1.3307 +** Return SQLITE_OK on success or an error code if anything goes
1.3308 +** wrong. An error is returned if "offset+amt" is larger than
1.3309 +** the available payload.
1.3310 +*/
1.3311 +int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1.3312 + int rc; /* Return code */
1.3313 +
1.3314 + assert( cursorHoldsMutex(pCur) );
1.3315 + rc = restoreCursorPosition(pCur);
1.3316 + if( rc==SQLITE_OK ){
1.3317 + assert( pCur->eState==CURSOR_VALID );
1.3318 + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
1.3319 + if( pCur->apPage[0]->intKey ){ /* Reading key bytes when the root page is intKey is reported as corruption */
1.3320 + return SQLITE_CORRUPT_BKPT;
1.3321 + }
1.3322 + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3323 + rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0); /* skipKey=0, eOp=0: read starting at the key */
1.3324 + }
1.3325 + return rc;
1.3326 +}
1.3327 +
1.3328 +/*
1.3329 +** Read part of the data associated with cursor pCur. Exactly
1.3330 +** "amt" bytes will be transferred into pBuf[]. The transfer
1.3331 +** begins at "offset".
1.3332 +**
1.3333 +** Return SQLITE_OK on success or an error code if anything goes
1.3334 +** wrong. An error is returned if "offset+amt" is larger than
1.3335 +** the available payload.
1.3336 +*/
1.3337 +int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1.3338 + int rc; /* Return code */
1.3339 +
1.3340 +#ifndef SQLITE_OMIT_INCRBLOB
1.3341 + if ( pCur->eState==CURSOR_INVALID ){ /* Tested before restoreCursorPosition(); compiled out with SQLITE_OMIT_INCRBLOB */
1.3342 + return SQLITE_ABORT;
1.3343 + }
1.3344 +#endif
1.3345 +
1.3346 + assert( cursorHoldsMutex(pCur) );
1.3347 + rc = restoreCursorPosition(pCur);
1.3348 + if( rc==SQLITE_OK ){
1.3349 + assert( pCur->eState==CURSOR_VALID );
1.3350 + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
1.3351 + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3352 + rc = accessPayload(pCur, offset, amt, pBuf, 1, 0); /* skipKey=1, eOp=0: read starting at the data */
1.3353 + }
1.3354 + return rc;
1.3355 +}
1.3356 +
1.3357 +/*
1.3358 +** Return a pointer to payload information from the entry that the
1.3359 +** pCur cursor is pointing to. The pointer is to the beginning of
1.3360 +** the key if skipKey==0 and it points to the beginning of data if
1.3361 +** skipKey==1. The number of bytes of available key/data is written
1.3362 +** into *pAmt. If *pAmt==0, then the value returned will not be
1.3363 +** a valid pointer.
1.3364 +**
1.3365 +** This routine is an optimization. It is common for the entire key
1.3366 +** and data to fit on the local page and for there to be no overflow
1.3367 +** pages. When that is so, this routine can be used to access the
1.3368 +** key and data without making a copy. If the key and/or data spills
1.3369 +** onto overflow pages, then accessPayload() must be used to reassemble
1.3370 +** the key/data and copy it into a preallocated buffer.
1.3371 +**
1.3372 +** The pointer returned by this routine looks directly into the cached
1.3373 +** page of the database. The data might change or move the next time
1.3374 +** any btree routine is called.
1.3375 +*/
1.3376 +static const unsigned char *fetchPayload(
1.3377 + BtCursor *pCur, /* Cursor pointing to entry to read from */
1.3378 + int *pAmt, /* Write the number of available bytes here */
1.3379 + int skipKey /* read beginning at data if this is true */
1.3380 +){
1.3381 + unsigned char *aPayload; /* Pointer into the cell that is returned to the caller */
1.3382 + MemPage *pPage; /* Page the cursor currently points at */
1.3383 + u32 nKey; /* Key bytes stored in the payload (0 for intKey pages) */
1.3384 + int nLocal; /* Byte count reported through *pAmt */
1.3385 +
1.3386 + assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
1.3387 + assert( pCur->eState==CURSOR_VALID );
1.3388 + assert( cursorHoldsMutex(pCur) );
1.3389 + pPage = pCur->apPage[pCur->iPage];
1.3390 + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
1.3391 + getCellInfo(pCur);
1.3392 + aPayload = pCur->info.pCell;
1.3393 + aPayload += pCur->info.nHeader; /* Step over the cell header to the payload */
1.3394 + if( pPage->intKey ){
1.3395 + nKey = 0;
1.3396 + }else{
1.3397 + nKey = pCur->info.nKey;
1.3398 + }
1.3399 + if( skipKey ){
1.3400 + aPayload += nKey;
1.3401 + nLocal = pCur->info.nLocal - nKey; /* NOTE(review): not a valid count if nKey exceeds info.nLocal - confirm that cannot happen when skipKey is set */
1.3402 + }else{
1.3403 + nLocal = pCur->info.nLocal;
1.3404 + if( nLocal>nKey ){
1.3405 + nLocal = nKey; /* Only the key bytes are reported, never the data that follows */
1.3406 + }
1.3407 + }
1.3408 + *pAmt = nLocal;
1.3409 + return aPayload;
1.3410 +}
1.3411 +
1.3412 +
1.3413 +/*
1.3414 +** For the entry that cursor pCur is pointing to, return as
1.3415 +** many bytes of the key or data as are available on the local
1.3416 +** b-tree page. Write the number of available bytes into *pAmt.
1.3417 +**
1.3418 +** The pointer returned is ephemeral. The key/data may move
1.3419 +** or be destroyed on the next call to any Btree routine,
1.3420 +** including calls from other threads against the same cache.
1.3421 +** Hence, a mutex on the BtShared should be held prior to calling
1.3422 +** this routine.
1.3423 +**
1.3424 +** These routines are used to get quick access to key and data
1.3425 +** in the common case where no overflow pages are used.
1.3426 +*/
1.3427 +const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
1.3428 + assert( cursorHoldsMutex(pCur) );
1.3429 + if( pCur->eState==CURSOR_VALID ){
1.3430 + return (const void*)fetchPayload(pCur, pAmt, 0); /* skipKey=0: pointer to the start of the key */
1.3431 + }
1.3432 + return 0; /* Cursor not valid: *pAmt is left unwritten */
1.3433 +}
1.3434 +const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
1.3435 + assert( cursorHoldsMutex(pCur) );
1.3436 + if( pCur->eState==CURSOR_VALID ){
1.3437 + return (const void*)fetchPayload(pCur, pAmt, 1); /* skipKey=1: pointer to the start of the data */
1.3438 + }
1.3439 + return 0; /* Cursor not valid: *pAmt is left unwritten */
1.3440 +}
1.3441 +
1.3442 +
1.3443 +/*
1.3444 +** Move the cursor down to a new child page. The newPgno argument is the
1.3445 +** page number of the child page to move to.
1.3446 +*/
1.3447 +static int moveToChild(BtCursor *pCur, u32 newPgno){
1.3448 + int rc; /* Result of getAndInitPage() */
1.3449 + int i = pCur->iPage; /* Depth of the cursor before the move */
1.3450 + MemPage *pNewPage; /* The child page being moved to */
1.3451 + BtShared *pBt = pCur->pBt;
1.3452 +
1.3453 + assert( cursorHoldsMutex(pCur) );
1.3454 + assert( pCur->eState==CURSOR_VALID );
1.3455 + assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
1.3456 + if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ /* No slot left in apPage[]/aiIdx[] for another level */
1.3457 + return SQLITE_CORRUPT_BKPT;
1.3458 + }
1.3459 + rc = getAndInitPage(pBt, newPgno, &pNewPage);
1.3460 + if( rc ) return rc; /* Cursor is unchanged on this path */
1.3461 + pCur->apPage[i+1] = pNewPage;
1.3462 + pCur->aiIdx[i+1] = 0; /* Start at the first cell of the child */
1.3463 + pCur->iPage++;
1.3464 +
1.3465 + pCur->info.nSize = 0; /* Invalidate the cached cell info (see getCellInfo) */
1.3466 + pCur->validNKey = 0;
1.3467 + if( pNewPage->nCell<1 ){ /* The child stays attached to the cursor even on this error return */
1.3468 + return SQLITE_CORRUPT_BKPT;
1.3469 + }
1.3470 + return SQLITE_OK;
1.3471 +}
1.3472 +
1.3473 +#ifndef NDEBUG
1.3474 +/*
1.3475 +** Page pParent is an internal (non-leaf) tree page. This function
1.3476 +** asserts that page number iChild is the left-child of the iIdx'th
1.3477 +** cell in page pParent. Or, if iIdx is equal to the total number of
1.3478 +** cells in pParent, that page number iChild is the right-child of
1.3479 +** the page. When NDEBUG is defined this becomes an empty macro.
1.3480 +*/
1.3481 +static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
1.3482 + assert( iIdx<=pParent->nCell );
1.3483 + if( iIdx==pParent->nCell ){
1.3484 + assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); /* Right-child pointer is read at header offset 8 */
1.3485 + }else{
1.3486 + assert( get4byte(findCell(pParent, iIdx))==iChild ); /* Left-child pointer is the first 4 bytes of the cell */
1.3487 + }
1.3488 +}
1.3489 +#else
1.3490 +# define assertParentIndex(x,y,z)
1.3491 +#endif
1.3492 +
1.3493 +/*
1.3494 +** Move the cursor up to the parent page.
1.3495 +**
1.3496 +** On return, pCur->aiIdx[pCur->iPage] is the cell index that contains
1.3497 +** the pointer to the page we are coming from. If we are coming from the
1.3498 +** right-most child page then that index is one more than
1.3499 +** the largest cell index.
1.3500 +*/
1.3501 +void sqlite3BtreeMoveToParent(BtCursor *pCur){
1.3502 + assert( cursorHoldsMutex(pCur) );
1.3503 + assert( pCur->eState==CURSOR_VALID );
1.3504 + assert( pCur->iPage>0 ); /* Must not be called while on the root page */
1.3505 + assert( pCur->apPage[pCur->iPage] );
1.3506 + assertParentIndex( /* Debug check that the parent slot really points at the current page */
1.3507 + pCur->apPage[pCur->iPage-1],
1.3508 + pCur->aiIdx[pCur->iPage-1],
1.3509 + pCur->apPage[pCur->iPage]->pgno
1.3510 + );
1.3511 + releasePage(pCur->apPage[pCur->iPage]);
1.3512 + pCur->iPage--; /* The parent's saved aiIdx[] entry becomes the current index */
1.3513 + pCur->info.nSize = 0; /* Invalidate the cached cell info (see getCellInfo) */
1.3514 + pCur->validNKey = 0;
1.3515 +}
1.3516 +
1.3517 +/*
1.3518 +** Move the cursor to the root page
1.3519 +*/
1.3520 +static int moveToRoot(BtCursor *pCur){
1.3521 + MemPage *pRoot;
1.3522 + int rc = SQLITE_OK;
1.3523 + Btree *p = pCur->pBtree;
1.3524 + BtShared *pBt = p->pBt;
1.3525 +
1.3526 + assert( cursorHoldsMutex(pCur) );
1.3527 + assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
1.3528 + assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
1.3529 + assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
1.3530 + if( pCur->eState>=CURSOR_REQUIRESEEK ){
1.3531 + if( pCur->eState==CURSOR_FAULT ){
1.3532 + return pCur->skip;
1.3533 + }
1.3534 + clearCursorPosition(pCur);
1.3535 + }
1.3536 +
1.3537 + if( pCur->iPage>=0 ){
1.3538 + int i;
1.3539 + for(i=1; i<=pCur->iPage; i++){
1.3540 + releasePage(pCur->apPage[i]);
1.3541 + }
1.3542 + }else{
1.3543 + if(
1.3544 + SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
1.3545 + ){
1.3546 + pCur->eState = CURSOR_INVALID;
1.3547 + return rc;
1.3548 + }
1.3549 + }
1.3550 +
1.3551 + pRoot = pCur->apPage[0];
1.3552 + assert( pRoot->pgno==pCur->pgnoRoot );
1.3553 + pCur->iPage = 0;
1.3554 + pCur->aiIdx[0] = 0;
1.3555 + pCur->info.nSize = 0;
1.3556 + pCur->atLast = 0;
1.3557 + pCur->validNKey = 0;
1.3558 +
1.3559 + if( pRoot->nCell==0 && !pRoot->leaf ){
1.3560 + Pgno subpage;
1.3561 + assert( pRoot->pgno==1 );
1.3562 + subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
1.3563 + assert( subpage>0 );
1.3564 + pCur->eState = CURSOR_VALID;
1.3565 + rc = moveToChild(pCur, subpage);
1.3566 + }else{
1.3567 + pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
1.3568 + }
1.3569 + return rc;
1.3570 +}
1.3571 +
1.3572 +/*
1.3573 +** Move the cursor down to the left-most leaf entry beneath the
1.3574 +** entry to which it is currently pointing.
1.3575 +**
1.3576 +** The left-most leaf is the one with the smallest key - the first
1.3577 +** in ascending order.
1.3578 +*/
1.3579 +static int moveToLeftmost(BtCursor *pCur){
1.3580 + Pgno pgno;
1.3581 + int rc = SQLITE_OK;
1.3582 + MemPage *pPage;
1.3583 +
1.3584 + assert( cursorHoldsMutex(pCur) );
1.3585 + assert( pCur->eState==CURSOR_VALID );
1.3586 + while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
1.3587 + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
1.3588 + pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
1.3589 + rc = moveToChild(pCur, pgno);
1.3590 + }
1.3591 + return rc;
1.3592 +}
1.3593 +
1.3594 +/*
1.3595 +** Move the cursor down to the right-most leaf entry beneath the
1.3596 +** page to which it is currently pointing. Notice the difference
1.3597 +** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1.3598 +** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1.3599 +** finds the right-most entry beneath the *page*.
1.3600 +**
1.3601 +** The right-most entry is the one with the largest key - the last
1.3602 +** key in ascending order.
1.3603 +*/
1.3604 +static int moveToRightmost(BtCursor *pCur){
1.3605 + Pgno pgno;
1.3606 + int rc = SQLITE_OK;
1.3607 + MemPage *pPage;
1.3608 +
1.3609 + assert( cursorHoldsMutex(pCur) );
1.3610 + assert( pCur->eState==CURSOR_VALID );
1.3611 + while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
1.3612 + pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.3613 + pCur->aiIdx[pCur->iPage] = pPage->nCell;
1.3614 + rc = moveToChild(pCur, pgno);
1.3615 + }
1.3616 + if( rc==SQLITE_OK ){
1.3617 + pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
1.3618 + pCur->info.nSize = 0;
1.3619 + pCur->validNKey = 0;
1.3620 + }
1.3621 + return rc;
1.3622 +}
1.3623 +
1.3624 +/* Move the cursor to the first entry in the table. Return SQLITE_OK
1.3625 +** on success. Set *pRes to 0 if the cursor actually points to something
1.3626 +** or set *pRes to 1 if the table is empty.
1.3627 +*/
1.3628 +int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
1.3629 + int rc;
1.3630 +
1.3631 + assert( cursorHoldsMutex(pCur) );
1.3632 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3633 + rc = moveToRoot(pCur);
1.3634 + if( rc==SQLITE_OK ){
1.3635 + if( pCur->eState==CURSOR_INVALID ){
1.3636 + assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3637 + *pRes = 1;
1.3638 + rc = SQLITE_OK;
1.3639 + }else{
1.3640 + assert( pCur->apPage[pCur->iPage]->nCell>0 );
1.3641 + *pRes = 0;
1.3642 + rc = moveToLeftmost(pCur);
1.3643 + }
1.3644 + }
1.3645 + return rc;
1.3646 +}
1.3647 +
1.3648 +/* Move the cursor to the last entry in the table. Return SQLITE_OK
1.3649 +** on success. Set *pRes to 0 if the cursor actually points to something
1.3650 +** or set *pRes to 1 if the table is empty.
1.3651 +*/
1.3652 +int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
1.3653 + int rc;
1.3654 +
1.3655 + assert( cursorHoldsMutex(pCur) );
1.3656 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3657 + rc = moveToRoot(pCur);
1.3658 + if( rc==SQLITE_OK ){
1.3659 + if( CURSOR_INVALID==pCur->eState ){
1.3660 + assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3661 + *pRes = 1;
1.3662 + }else{
1.3663 + assert( pCur->eState==CURSOR_VALID );
1.3664 + *pRes = 0;
1.3665 + rc = moveToRightmost(pCur);
1.3666 + getCellInfo(pCur);
1.3667 + pCur->atLast = rc==SQLITE_OK;
1.3668 + }
1.3669 + }
1.3670 + return rc;
1.3671 +}
1.3672 +
1.3673 +/* Move the cursor so that it points to an entry near the key
1.3674 +** specified by pIdxKey or intKey. Return a success code.
1.3675 +**
1.3676 +** For INTKEY tables, the intKey parameter is used. pIdxKey
1.3677 +** must be NULL. For index tables, pIdxKey is used and intKey
1.3678 +** is ignored.
1.3679 +**
1.3680 +** If an exact match is not found, then the cursor is always
1.3681 +** left pointing at a leaf page which would hold the entry if it
1.3682 +** were present. The cursor might point to an entry that comes
1.3683 +** before or after the key.
1.3684 +**
1.3685 +** The result of comparing the key with the entry to which the
1.3686 +** cursor is written to *pRes if pRes!=NULL. The meaning of
1.3687 +** this value is as follows:
1.3688 +**
1.3689 +** *pRes<0 The cursor is left pointing at an entry that
1.3690 +** is smaller than pKey or if the table is empty
1.3691 +** and the cursor is therefore left point to nothing.
1.3692 +**
1.3693 +** *pRes==0 The cursor is left pointing at an entry that
1.3694 +** exactly matches pKey.
1.3695 +**
1.3696 +** *pRes>0 The cursor is left pointing at an entry that
1.3697 +** is larger than pKey.
1.3698 +**
1.3699 +*/
1.3700 +int sqlite3BtreeMovetoUnpacked(
1.3701 + BtCursor *pCur, /* The cursor to be moved */
1.3702 + UnpackedRecord *pIdxKey, /* Unpacked index key */
1.3703 + i64 intKey, /* The table key */
1.3704 + int biasRight, /* If true, bias the search to the high end */
1.3705 + int *pRes /* Write search results here */
1.3706 +){
1.3707 + int rc;
1.3708 +
1.3709 + assert( cursorHoldsMutex(pCur) );
1.3710 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3711 +
1.3712 + /* If the cursor is already positioned at the point we are trying
1.3713 + ** to move to, then just return without doing any work */
1.3714 + if( pCur->eState==CURSOR_VALID && pCur->validNKey
1.3715 + && pCur->apPage[0]->intKey
1.3716 + ){
1.3717 + if( pCur->info.nKey==intKey ){
1.3718 + *pRes = 0;
1.3719 + return SQLITE_OK;
1.3720 + }
1.3721 + if( pCur->atLast && pCur->info.nKey<intKey ){
1.3722 + *pRes = -1;
1.3723 + return SQLITE_OK;
1.3724 + }
1.3725 + }
1.3726 +
1.3727 + rc = moveToRoot(pCur);
1.3728 + if( rc ){
1.3729 + return rc;
1.3730 + }
1.3731 + assert( pCur->apPage[pCur->iPage] );
1.3732 + assert( pCur->apPage[pCur->iPage]->isInit );
1.3733 + if( pCur->eState==CURSOR_INVALID ){
1.3734 + *pRes = -1;
1.3735 + assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3736 + return SQLITE_OK;
1.3737 + }
1.3738 + assert( pCur->apPage[0]->intKey || pIdxKey );
1.3739 + for(;;){
1.3740 + int lwr, upr;
1.3741 + Pgno chldPg;
1.3742 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.3743 + int c = -1; /* pRes return if table is empty must be -1 */
1.3744 + lwr = 0;
1.3745 + upr = pPage->nCell-1;
1.3746 + if( !pPage->intKey && pIdxKey==0 ){
1.3747 + rc = SQLITE_CORRUPT_BKPT;
1.3748 + goto moveto_finish;
1.3749 + }
1.3750 + if( biasRight ){
1.3751 + pCur->aiIdx[pCur->iPage] = upr;
1.3752 + }else{
1.3753 + pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
1.3754 + }
1.3755 + if( lwr<=upr ) for(;;){
1.3756 + void *pCellKey;
1.3757 + i64 nCellKey;
1.3758 + int idx = pCur->aiIdx[pCur->iPage];
1.3759 + pCur->info.nSize = 0;
1.3760 + pCur->validNKey = 1;
1.3761 + if( pPage->intKey ){
1.3762 + u8 *pCell;
1.3763 + pCell = findCell(pPage, idx) + pPage->childPtrSize;
1.3764 + if( pPage->hasData ){
1.3765 + u32 dummy;
1.3766 + pCell += getVarint32(pCell, dummy);
1.3767 + }
1.3768 + getVarint(pCell, (u64*)&nCellKey);
1.3769 + if( nCellKey==intKey ){
1.3770 + c = 0;
1.3771 + }else if( nCellKey<intKey ){
1.3772 + c = -1;
1.3773 + }else{
1.3774 + assert( nCellKey>intKey );
1.3775 + c = +1;
1.3776 + }
1.3777 + }else{
1.3778 + int available;
1.3779 + pCellKey = (void *)fetchPayload(pCur, &available, 0);
1.3780 + nCellKey = pCur->info.nKey;
1.3781 + if( available>=nCellKey ){
1.3782 + c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
1.3783 + }else{
1.3784 + pCellKey = sqlite3Malloc( nCellKey );
1.3785 + if( pCellKey==0 ){
1.3786 + rc = SQLITE_NOMEM;
1.3787 + goto moveto_finish;
1.3788 + }
1.3789 + rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
1.3790 + c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
1.3791 + sqlite3_free(pCellKey);
1.3792 + if( rc ) goto moveto_finish;
1.3793 + }
1.3794 + }
1.3795 + if( c==0 ){
1.3796 + pCur->info.nKey = nCellKey;
1.3797 + if( pPage->intKey && !pPage->leaf ){
1.3798 + lwr = idx;
1.3799 + upr = lwr - 1;
1.3800 + break;
1.3801 + }else{
1.3802 + if( pRes ) *pRes = 0;
1.3803 + rc = SQLITE_OK;
1.3804 + goto moveto_finish;
1.3805 + }
1.3806 + }
1.3807 + if( c<0 ){
1.3808 + lwr = idx+1;
1.3809 + }else{
1.3810 + upr = idx-1;
1.3811 + }
1.3812 + if( lwr>upr ){
1.3813 + pCur->info.nKey = nCellKey;
1.3814 + break;
1.3815 + }
1.3816 + pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
1.3817 + }
1.3818 + assert( lwr==upr+1 );
1.3819 + assert( pPage->isInit );
1.3820 + if( pPage->leaf ){
1.3821 + chldPg = 0;
1.3822 + }else if( lwr>=pPage->nCell ){
1.3823 + chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.3824 + }else{
1.3825 + chldPg = get4byte(findCell(pPage, lwr));
1.3826 + }
1.3827 + if( chldPg==0 ){
1.3828 + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3829 + if( pRes ) *pRes = c;
1.3830 + rc = SQLITE_OK;
1.3831 + goto moveto_finish;
1.3832 + }
1.3833 + pCur->aiIdx[pCur->iPage] = lwr;
1.3834 + pCur->info.nSize = 0;
1.3835 + pCur->validNKey = 0;
1.3836 + rc = moveToChild(pCur, chldPg);
1.3837 + if( rc ) goto moveto_finish;
1.3838 + }
1.3839 +moveto_finish:
1.3840 + return rc;
1.3841 +}
1.3842 +
1.3843 +/*
1.3844 +** In this version of BtreeMoveto, pKey is a packed index record
1.3845 +** such as is generated by the OP_MakeRecord opcode. Unpack the
1.3846 +** record and then call BtreeMovetoUnpacked() to do the work.
1.3847 +*/
1.3848 +int sqlite3BtreeMoveto(
1.3849 + BtCursor *pCur, /* Cursor open on the btree to be searched */
1.3850 + const void *pKey, /* Packed key if the btree is an index */
1.3851 + i64 nKey, /* Integer key for tables. Size of pKey for indices */
1.3852 + int bias, /* Bias search to the high end */
1.3853 + int *pRes /* Write search results here */
1.3854 +){
1.3855 + int rc; /* Status code */
1.3856 + UnpackedRecord *pIdxKey; /* Unpacked index key */
1.3857 + UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
1.3858 +
1.3859 + if( pKey ){
1.3860 + pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
1.3861 + aSpace, sizeof(aSpace));
1.3862 + if( pIdxKey==0 ) return SQLITE_NOMEM;
1.3863 + }else{
1.3864 + pIdxKey = 0;
1.3865 + }
1.3866 + rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
1.3867 + if( pKey ){
1.3868 + sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
1.3869 + }
1.3870 + return rc;
1.3871 +}
1.3872 +
1.3873 +
1.3874 +/*
1.3875 +** Return TRUE if the cursor is not pointing at an entry of the table.
1.3876 +**
1.3877 +** TRUE will be returned after a call to sqlite3BtreeNext() moves
1.3878 +** past the last entry in the table or sqlite3BtreePrev() moves past
1.3879 +** the first entry. TRUE is also returned if the table is empty.
1.3880 +*/
1.3881 +int sqlite3BtreeEof(BtCursor *pCur){
1.3882 + /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
1.3883 + ** have been deleted? This API will need to change to return an error code
1.3884 + ** as well as the boolean result value.
1.3885 + */
1.3886 + return (CURSOR_VALID!=pCur->eState);
1.3887 +}
1.3888 +
1.3889 +/*
1.3890 +** Return the database connection handle for a cursor.
1.3891 +*/
1.3892 +sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
1.3893 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3894 + return pCur->pBtree->db;
1.3895 +}
1.3896 +
1.3897 +/*
1.3898 +** Advance the cursor to the next entry in the database. If
1.3899 +** successful then set *pRes=0. If the cursor
1.3900 +** was already pointing to the last entry in the database before
1.3901 +** this routine was called, then set *pRes=1.
1.3902 +*/
1.3903 +int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
1.3904 + int rc;
1.3905 + int idx;
1.3906 + MemPage *pPage;
1.3907 +
1.3908 + assert( cursorHoldsMutex(pCur) );
1.3909 + rc = restoreCursorPosition(pCur);
1.3910 + if( rc!=SQLITE_OK ){
1.3911 + return rc;
1.3912 + }
1.3913 + assert( pRes!=0 );
1.3914 + if( CURSOR_INVALID==pCur->eState ){
1.3915 + *pRes = 1;
1.3916 + return SQLITE_OK;
1.3917 + }
1.3918 + if( pCur->skip>0 ){
1.3919 + pCur->skip = 0;
1.3920 + *pRes = 0;
1.3921 + return SQLITE_OK;
1.3922 + }
1.3923 + pCur->skip = 0;
1.3924 +
1.3925 + pPage = pCur->apPage[pCur->iPage];
1.3926 + idx = ++pCur->aiIdx[pCur->iPage];
1.3927 + assert( pPage->isInit );
1.3928 + assert( idx<=pPage->nCell );
1.3929 +
1.3930 + pCur->info.nSize = 0;
1.3931 + pCur->validNKey = 0;
1.3932 + if( idx>=pPage->nCell ){
1.3933 + if( !pPage->leaf ){
1.3934 + rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
1.3935 + if( rc ) return rc;
1.3936 + rc = moveToLeftmost(pCur);
1.3937 + *pRes = 0;
1.3938 + return rc;
1.3939 + }
1.3940 + do{
1.3941 + if( pCur->iPage==0 ){
1.3942 + *pRes = 1;
1.3943 + pCur->eState = CURSOR_INVALID;
1.3944 + return SQLITE_OK;
1.3945 + }
1.3946 + sqlite3BtreeMoveToParent(pCur);
1.3947 + pPage = pCur->apPage[pCur->iPage];
1.3948 + }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
1.3949 + *pRes = 0;
1.3950 + if( pPage->intKey ){
1.3951 + rc = sqlite3BtreeNext(pCur, pRes);
1.3952 + }else{
1.3953 + rc = SQLITE_OK;
1.3954 + }
1.3955 + return rc;
1.3956 + }
1.3957 + *pRes = 0;
1.3958 + if( pPage->leaf ){
1.3959 + return SQLITE_OK;
1.3960 + }
1.3961 + rc = moveToLeftmost(pCur);
1.3962 + return rc;
1.3963 +}
1.3964 +
1.3965 +
1.3966 +/*
1.3967 +** Step the cursor to the back to the previous entry in the database. If
1.3968 +** successful then set *pRes=0. If the cursor
1.3969 +** was already pointing to the first entry in the database before
1.3970 +** this routine was called, then set *pRes=1.
1.3971 +*/
1.3972 +int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
1.3973 + int rc;
1.3974 + MemPage *pPage;
1.3975 +
1.3976 + assert( cursorHoldsMutex(pCur) );
1.3977 + rc = restoreCursorPosition(pCur);
1.3978 + if( rc!=SQLITE_OK ){
1.3979 + return rc;
1.3980 + }
1.3981 + pCur->atLast = 0;
1.3982 + if( CURSOR_INVALID==pCur->eState ){
1.3983 + *pRes = 1;
1.3984 + return SQLITE_OK;
1.3985 + }
1.3986 + if( pCur->skip<0 ){
1.3987 + pCur->skip = 0;
1.3988 + *pRes = 0;
1.3989 + return SQLITE_OK;
1.3990 + }
1.3991 + pCur->skip = 0;
1.3992 +
1.3993 + pPage = pCur->apPage[pCur->iPage];
1.3994 + assert( pPage->isInit );
1.3995 + if( !pPage->leaf ){
1.3996 + int idx = pCur->aiIdx[pCur->iPage];
1.3997 + rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
1.3998 + if( rc ){
1.3999 + return rc;
1.4000 + }
1.4001 + rc = moveToRightmost(pCur);
1.4002 + }else{
1.4003 + while( pCur->aiIdx[pCur->iPage]==0 ){
1.4004 + if( pCur->iPage==0 ){
1.4005 + pCur->eState = CURSOR_INVALID;
1.4006 + *pRes = 1;
1.4007 + return SQLITE_OK;
1.4008 + }
1.4009 + sqlite3BtreeMoveToParent(pCur);
1.4010 + }
1.4011 + pCur->info.nSize = 0;
1.4012 + pCur->validNKey = 0;
1.4013 +
1.4014 + pCur->aiIdx[pCur->iPage]--;
1.4015 + pPage = pCur->apPage[pCur->iPage];
1.4016 + if( pPage->intKey && !pPage->leaf ){
1.4017 + rc = sqlite3BtreePrevious(pCur, pRes);
1.4018 + }else{
1.4019 + rc = SQLITE_OK;
1.4020 + }
1.4021 + }
1.4022 + *pRes = 0;
1.4023 + return rc;
1.4024 +}
1.4025 +
1.4026 +/*
1.4027 +** Allocate a new page from the database file.
1.4028 +**
1.4029 +** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
1.4030 +** has already been called on the new page.) The new page has also
1.4031 +** been referenced and the calling routine is responsible for calling
1.4032 +** sqlite3PagerUnref() on the new page when it is done.
1.4033 +**
1.4034 +** SQLITE_OK is returned on success. Any other return value indicates
1.4035 +** an error. *ppPage and *pPgno are undefined in the event of an error.
1.4036 +** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
1.4037 +**
1.4038 +** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1.4039 +** locate a page close to the page number "nearby". This can be used in an
1.4040 +** attempt to keep related pages close to each other in the database file,
1.4041 +** which in turn can make database access faster.
1.4042 +**
1.4043 +** If the "exact" parameter is not 0, and the page-number nearby exists
1.4044 +** anywhere on the free-list, then it is guarenteed to be returned. This
1.4045 +** is only used by auto-vacuum databases when allocating a new table.
1.4046 +*/
1.4047 +static int allocateBtreePage(
1.4048 + BtShared *pBt,
1.4049 + MemPage **ppPage,
1.4050 + Pgno *pPgno,
1.4051 + Pgno nearby,
1.4052 + u8 exact
1.4053 +){
1.4054 + MemPage *pPage1;
1.4055 + int rc;
1.4056 + int n; /* Number of pages on the freelist */
1.4057 + int k; /* Number of leaves on the trunk of the freelist */
1.4058 + MemPage *pTrunk = 0;
1.4059 + MemPage *pPrevTrunk = 0;
1.4060 +
1.4061 + assert( sqlite3_mutex_held(pBt->mutex) );
1.4062 + pPage1 = pBt->pPage1;
1.4063 + n = get4byte(&pPage1->aData[36]);
1.4064 + if( n>0 ){
1.4065 + /* There are pages on the freelist. Reuse one of those pages. */
1.4066 + Pgno iTrunk;
1.4067 + u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
1.4068 +
1.4069 + /* If the 'exact' parameter was true and a query of the pointer-map
1.4070 + ** shows that the page 'nearby' is somewhere on the free-list, then
1.4071 + ** the entire-list will be searched for that page.
1.4072 + */
1.4073 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4074 + if( exact && nearby<=pagerPagecount(pBt->pPager) ){
1.4075 + u8 eType;
1.4076 + assert( nearby>0 );
1.4077 + assert( pBt->autoVacuum );
1.4078 + rc = ptrmapGet(pBt, nearby, &eType, 0);
1.4079 + if( rc ) return rc;
1.4080 + if( eType==PTRMAP_FREEPAGE ){
1.4081 + searchList = 1;
1.4082 + }
1.4083 + *pPgno = nearby;
1.4084 + }
1.4085 +#endif
1.4086 +
1.4087 + /* Decrement the free-list count by 1. Set iTrunk to the index of the
1.4088 + ** first free-list trunk page. iPrevTrunk is initially 1.
1.4089 + */
1.4090 + rc = sqlite3PagerWrite(pPage1->pDbPage);
1.4091 + if( rc ) return rc;
1.4092 + put4byte(&pPage1->aData[36], n-1);
1.4093 +
1.4094 + /* The code within this loop is run only once if the 'searchList' variable
1.4095 + ** is not true. Otherwise, it runs once for each trunk-page on the
1.4096 + ** free-list until the page 'nearby' is located.
1.4097 + */
1.4098 + do {
1.4099 + pPrevTrunk = pTrunk;
1.4100 + if( pPrevTrunk ){
1.4101 + iTrunk = get4byte(&pPrevTrunk->aData[0]);
1.4102 + }else{
1.4103 + iTrunk = get4byte(&pPage1->aData[32]);
1.4104 + }
1.4105 + rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
1.4106 + if( rc ){
1.4107 + pTrunk = 0;
1.4108 + goto end_allocate_page;
1.4109 + }
1.4110 +
1.4111 + k = get4byte(&pTrunk->aData[4]);
1.4112 + if( k==0 && !searchList ){
1.4113 + /* The trunk has no leaves and the list is not being searched.
1.4114 + ** So extract the trunk page itself and use it as the newly
1.4115 + ** allocated page */
1.4116 + assert( pPrevTrunk==0 );
1.4117 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4118 + if( rc ){
1.4119 + goto end_allocate_page;
1.4120 + }
1.4121 + *pPgno = iTrunk;
1.4122 + memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
1.4123 + *ppPage = pTrunk;
1.4124 + pTrunk = 0;
1.4125 + TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
1.4126 + }else if( k>pBt->usableSize/4 - 2 ){
1.4127 + /* Value of k is out of range. Database corruption */
1.4128 + rc = SQLITE_CORRUPT_BKPT;
1.4129 + goto end_allocate_page;
1.4130 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4131 + }else if( searchList && nearby==iTrunk ){
1.4132 + /* The list is being searched and this trunk page is the page
1.4133 + ** to allocate, regardless of whether it has leaves.
1.4134 + */
1.4135 + assert( *pPgno==iTrunk );
1.4136 + *ppPage = pTrunk;
1.4137 + searchList = 0;
1.4138 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4139 + if( rc ){
1.4140 + goto end_allocate_page;
1.4141 + }
1.4142 + if( k==0 ){
1.4143 + if( !pPrevTrunk ){
1.4144 + memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
1.4145 + }else{
1.4146 + memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
1.4147 + }
1.4148 + }else{
1.4149 + /* The trunk page is required by the caller but it contains
1.4150 + ** pointers to free-list leaves. The first leaf becomes a trunk
1.4151 + ** page in this case.
1.4152 + */
1.4153 + MemPage *pNewTrunk;
1.4154 + Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
1.4155 + rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
1.4156 + if( rc!=SQLITE_OK ){
1.4157 + goto end_allocate_page;
1.4158 + }
1.4159 + rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
1.4160 + if( rc!=SQLITE_OK ){
1.4161 + releasePage(pNewTrunk);
1.4162 + goto end_allocate_page;
1.4163 + }
1.4164 + memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
1.4165 + put4byte(&pNewTrunk->aData[4], k-1);
1.4166 + memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
1.4167 + releasePage(pNewTrunk);
1.4168 + if( !pPrevTrunk ){
1.4169 + put4byte(&pPage1->aData[32], iNewTrunk);
1.4170 + }else{
1.4171 + rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
1.4172 + if( rc ){
1.4173 + goto end_allocate_page;
1.4174 + }
1.4175 + put4byte(&pPrevTrunk->aData[0], iNewTrunk);
1.4176 + }
1.4177 + }
1.4178 + pTrunk = 0;
1.4179 + TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
1.4180 +#endif
1.4181 + }else{
1.4182 + /* Extract a leaf from the trunk */
1.4183 + int closest;
1.4184 + Pgno iPage;
1.4185 + unsigned char *aData = pTrunk->aData;
1.4186 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4187 + if( rc ){
1.4188 + goto end_allocate_page;
1.4189 + }
1.4190 + if( nearby>0 ){
1.4191 + int i, dist;
1.4192 + closest = 0;
1.4193 + dist = get4byte(&aData[8]) - nearby;
1.4194 + if( dist<0 ) dist = -dist;
1.4195 + for(i=1; i<k; i++){
1.4196 + int d2 = get4byte(&aData[8+i*4]) - nearby;
1.4197 + if( d2<0 ) d2 = -d2;
1.4198 + if( d2<dist ){
1.4199 + closest = i;
1.4200 + dist = d2;
1.4201 + }
1.4202 + }
1.4203 + }else{
1.4204 + closest = 0;
1.4205 + }
1.4206 +
1.4207 + iPage = get4byte(&aData[8+closest*4]);
1.4208 + if( !searchList || iPage==nearby ){
1.4209 + int nPage;
1.4210 + *pPgno = iPage;
1.4211 + nPage = pagerPagecount(pBt->pPager);
1.4212 + if( *pPgno>nPage ){
1.4213 + /* Free page off the end of the file */
1.4214 + rc = SQLITE_CORRUPT_BKPT;
1.4215 + goto end_allocate_page;
1.4216 + }
1.4217 + TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
1.4218 + ": %d more free pages\n",
1.4219 + *pPgno, closest+1, k, pTrunk->pgno, n-1));
1.4220 + if( closest<k-1 ){
1.4221 + memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
1.4222 + }
1.4223 + put4byte(&aData[4], k-1);
1.4224 + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
1.4225 + if( rc==SQLITE_OK ){
1.4226 + sqlite3PagerDontRollback((*ppPage)->pDbPage);
1.4227 + rc = sqlite3PagerWrite((*ppPage)->pDbPage);
1.4228 + if( rc!=SQLITE_OK ){
1.4229 + releasePage(*ppPage);
1.4230 + }
1.4231 + }
1.4232 + searchList = 0;
1.4233 + }
1.4234 + }
1.4235 + releasePage(pPrevTrunk);
1.4236 + pPrevTrunk = 0;
1.4237 + }while( searchList );
1.4238 + }else{
1.4239 + /* There are no pages on the freelist, so create a new page at the
1.4240 + ** end of the file */
1.4241 + int nPage = pagerPagecount(pBt->pPager);
1.4242 + *pPgno = nPage + 1;
1.4243 +
1.4244 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4245 + if( pBt->nTrunc ){
1.4246 + /* An incr-vacuum has already run within this transaction. So the
1.4247 + ** page to allocate is not from the physical end of the file, but
1.4248 + ** at pBt->nTrunc.
1.4249 + */
1.4250 + *pPgno = pBt->nTrunc+1;
1.4251 + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
1.4252 + (*pPgno)++;
1.4253 + }
1.4254 + }
1.4255 + if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
1.4256 + /* If *pPgno refers to a pointer-map page, allocate two new pages
1.4257 + ** at the end of the file instead of one. The first allocated page
1.4258 + ** becomes a new pointer-map page, the second is used by the caller.
1.4259 + */
1.4260 + TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
1.4261 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4262 + (*pPgno)++;
1.4263 + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
1.4264 + }
1.4265 + if( pBt->nTrunc ){
1.4266 + pBt->nTrunc = *pPgno;
1.4267 + }
1.4268 +#endif
1.4269 +
1.4270 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4271 + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
1.4272 + if( rc ) return rc;
1.4273 + rc = sqlite3PagerWrite((*ppPage)->pDbPage);
1.4274 + if( rc!=SQLITE_OK ){
1.4275 + releasePage(*ppPage);
1.4276 + }
1.4277 + TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
1.4278 + }
1.4279 +
1.4280 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4281 +
1.4282 +end_allocate_page:
1.4283 + releasePage(pTrunk);
1.4284 + releasePage(pPrevTrunk);
1.4285 + if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
1.4286 + releasePage(*ppPage);
1.4287 + return SQLITE_CORRUPT_BKPT;
1.4288 + }
1.4289 + return rc;
1.4290 +}
1.4291 +
1.4292 +/*
1.4293 +** Add a page of the database file to the freelist.
1.4294 +**
1.4295 +** sqlite3PagerUnref() is NOT called for pPage.
1.4296 +*/
1.4297 +static int freePage(MemPage *pPage){
1.4298 + BtShared *pBt = pPage->pBt;
1.4299 + MemPage *pPage1 = pBt->pPage1;
1.4300 + int rc, n, k;
1.4301 +
1.4302 + /* Prepare the page for freeing */
1.4303 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4304 + assert( pPage->pgno>1 );
1.4305 + pPage->isInit = 0;
1.4306 +
1.4307 + /* Increment the free page count on pPage1 */
1.4308 + rc = sqlite3PagerWrite(pPage1->pDbPage);
1.4309 + if( rc ) return rc;
1.4310 + n = get4byte(&pPage1->aData[36]);
1.4311 + put4byte(&pPage1->aData[36], n+1);
1.4312 +
1.4313 +#ifdef SQLITE_SECURE_DELETE
1.4314 + /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
1.4315 + ** always fully overwrite deleted information with zeros.
1.4316 + */
1.4317 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.4318 + if( rc ) return rc;
1.4319 + memset(pPage->aData, 0, pPage->pBt->pageSize);
1.4320 +#endif
1.4321 +
1.4322 + /* If the database supports auto-vacuum, write an entry in the pointer-map
1.4323 + ** to indicate that the page is free.
1.4324 + */
1.4325 + if( ISAUTOVACUUM ){
1.4326 + rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
1.4327 + if( rc ) return rc;
1.4328 + }
1.4329 +
1.4330 + if( n==0 ){
1.4331 + /* This is the first free page */
1.4332 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.4333 + if( rc ) return rc;
1.4334 + memset(pPage->aData, 0, 8);
1.4335 + put4byte(&pPage1->aData[32], pPage->pgno);
1.4336 + TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
1.4337 + }else{
1.4338 + /* Other free pages already exist. Retrive the first trunk page
1.4339 + ** of the freelist and find out how many leaves it has. */
1.4340 + MemPage *pTrunk;
1.4341 + rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
1.4342 + if( rc ) return rc;
1.4343 + k = get4byte(&pTrunk->aData[4]);
1.4344 + if( k>=pBt->usableSize/4 - 8 ){
1.4345 + /* The trunk is full. Turn the page being freed into a new
1.4346 + ** trunk page with no leaves.
1.4347 + **
1.4348 + ** Note that the trunk page is not really full until it contains
1.4349 + ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
1.4350 + ** coded. But due to a coding error in versions of SQLite prior to
1.4351 + ** 3.6.0, databases with freelist trunk pages holding more than
1.4352 + ** usableSize/4 - 8 entries will be reported as corrupt. In order
1.4353 + ** to maintain backwards compatibility with older versions of SQLite,
1.4354 + ** we will contain to restrict the number of entries to usableSize/4 - 8
1.4355 + ** for now. At some point in the future (once everyone has upgraded
1.4356 + ** to 3.6.0 or later) we should consider fixing the conditional above
1.4357 + ** to read "usableSize/4-2" instead of "usableSize/4-8".
1.4358 + */
1.4359 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.4360 + if( rc==SQLITE_OK ){
1.4361 + put4byte(pPage->aData, pTrunk->pgno);
1.4362 + put4byte(&pPage->aData[4], 0);
1.4363 + put4byte(&pPage1->aData[32], pPage->pgno);
1.4364 + TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
1.4365 + pPage->pgno, pTrunk->pgno));
1.4366 + }
1.4367 + }else if( k<0 ){
1.4368 + rc = SQLITE_CORRUPT;
1.4369 + }else{
1.4370 + /* Add the newly freed page as a leaf on the current trunk */
1.4371 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4372 + if( rc==SQLITE_OK ){
1.4373 + put4byte(&pTrunk->aData[4], k+1);
1.4374 + put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
1.4375 +#ifndef SQLITE_SECURE_DELETE
1.4376 + rc = sqlite3PagerDontWrite(pPage->pDbPage);
1.4377 +#endif
1.4378 + }
1.4379 + TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
1.4380 + }
1.4381 + releasePage(pTrunk);
1.4382 + }
1.4383 + return rc;
1.4384 +}
1.4385 +
1.4386 +/*
1.4387 +** Free any overflow pages associated with the given Cell.
1.4388 +*/
1.4389 +static int clearCell(MemPage *pPage, unsigned char *pCell){
1.4390 + BtShared *pBt = pPage->pBt;
1.4391 + CellInfo info;
1.4392 + Pgno ovflPgno;
1.4393 + int rc;
1.4394 + int nOvfl;
1.4395 + int ovflPageSize;
1.4396 +
1.4397 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4398 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4399 + if( info.iOverflow==0 ){
1.4400 + return SQLITE_OK; /* No overflow pages. Return without doing anything */
1.4401 + }
1.4402 + ovflPgno = get4byte(&pCell[info.iOverflow]);
1.4403 + ovflPageSize = pBt->usableSize - 4;
1.4404 + nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
1.4405 + assert( ovflPgno==0 || nOvfl>0 );
1.4406 + while( nOvfl-- ){
1.4407 + MemPage *pOvfl;
1.4408 + if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
1.4409 + return SQLITE_CORRUPT_BKPT;
1.4410 + }
1.4411 +
1.4412 + rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
1.4413 + if( rc ) return rc;
1.4414 + rc = freePage(pOvfl);
1.4415 + sqlite3PagerUnref(pOvfl->pDbPage);
1.4416 + if( rc ) return rc;
1.4417 + }
1.4418 + return SQLITE_OK;
1.4419 +}
1.4420 +
1.4421 +/*
1.4422 +** Create the byte sequence used to represent a cell on page pPage
1.4423 +** and write that byte sequence into pCell[]. Overflow pages are
1.4424 +** allocated and filled in as necessary. The calling procedure
1.4425 +** is responsible for making sure sufficient space has been allocated
1.4426 +** for pCell[].
1.4427 +**
1.4428 +** Note that pCell does not necessary need to point to the pPage->aData
1.4429 +** area. pCell might point to some temporary storage. The cell will
1.4430 +** be constructed in this temporary area then copied into pPage->aData
1.4431 +** later.
1.4432 +*/
1.4433 +static int fillInCell(
1.4434 + MemPage *pPage, /* The page that contains the cell */
1.4435 + unsigned char *pCell, /* Complete text of the cell */
1.4436 + const void *pKey, i64 nKey, /* The key */
1.4437 + const void *pData,int nData, /* The data */
1.4438 + int nZero, /* Extra zero bytes to append to pData */
1.4439 + int *pnSize /* Write cell size here */
1.4440 +){
1.4441 + int nPayload;
1.4442 + const u8 *pSrc;
1.4443 + int nSrc, n, rc;
1.4444 + int spaceLeft;
1.4445 + MemPage *pOvfl = 0;
1.4446 + MemPage *pToRelease = 0;
1.4447 + unsigned char *pPrior;
1.4448 + unsigned char *pPayload;
1.4449 + BtShared *pBt = pPage->pBt;
1.4450 + Pgno pgnoOvfl = 0;
1.4451 + int nHeader;
1.4452 + CellInfo info;
1.4453 +
1.4454 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4455 +
1.4456 + /* Fill in the header. */
1.4457 + nHeader = 0;
1.4458 + if( !pPage->leaf ){
1.4459 + nHeader += 4;
1.4460 + }
1.4461 + if( pPage->hasData ){
1.4462 + nHeader += putVarint(&pCell[nHeader], nData+nZero);
1.4463 + }else{
1.4464 + nData = nZero = 0;
1.4465 + }
1.4466 + nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
1.4467 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4468 + assert( info.nHeader==nHeader );
1.4469 + assert( info.nKey==nKey );
1.4470 + assert( info.nData==nData+nZero );
1.4471 +
1.4472 + /* Fill in the payload */
1.4473 + nPayload = nData + nZero;
1.4474 + if( pPage->intKey ){
1.4475 + pSrc = pData;
1.4476 + nSrc = nData;
1.4477 + nData = 0;
1.4478 + }else{
1.4479 + nPayload += nKey;
1.4480 + pSrc = pKey;
1.4481 + nSrc = nKey;
1.4482 + }
1.4483 + *pnSize = info.nSize;
1.4484 + spaceLeft = info.nLocal;
1.4485 + pPayload = &pCell[nHeader];
1.4486 + pPrior = &pCell[info.iOverflow];
1.4487 +
1.4488 + while( nPayload>0 ){
1.4489 + if( spaceLeft==0 ){
1.4490 + int isExact = 0;
1.4491 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4492 + Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
1.4493 + if( pBt->autoVacuum ){
1.4494 + do{
1.4495 + pgnoOvfl++;
1.4496 + } while(
1.4497 + PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
1.4498 + );
1.4499 + if( pgnoOvfl>1 ){
1.4500 + /* isExact = 1; */
1.4501 + }
1.4502 + }
1.4503 +#endif
1.4504 + rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
1.4505 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4506 + /* If the database supports auto-vacuum, and the second or subsequent
1.4507 + ** overflow page is being allocated, add an entry to the pointer-map
1.4508 + ** for that page now.
1.4509 + **
1.4510 + ** If this is the first overflow page, then write a partial entry
1.4511 + ** to the pointer-map. If we write nothing to this pointer-map slot,
1.4512 + ** then the optimistic overflow chain processing in clearCell()
1.4513 + ** may misinterpret the uninitialised values and delete the
1.4514 + ** wrong pages from the database.
1.4515 + */
1.4516 + if( pBt->autoVacuum && rc==SQLITE_OK ){
1.4517 + u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
1.4518 + rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
1.4519 + if( rc ){
1.4520 + releasePage(pOvfl);
1.4521 + }
1.4522 + }
1.4523 +#endif
1.4524 + if( rc ){
1.4525 + releasePage(pToRelease);
1.4526 + return rc;
1.4527 + }
1.4528 + put4byte(pPrior, pgnoOvfl);
1.4529 + releasePage(pToRelease);
1.4530 + pToRelease = pOvfl;
1.4531 + pPrior = pOvfl->aData;
1.4532 + put4byte(pPrior, 0);
1.4533 + pPayload = &pOvfl->aData[4];
1.4534 + spaceLeft = pBt->usableSize - 4;
1.4535 + }
1.4536 + n = nPayload;
1.4537 + if( n>spaceLeft ) n = spaceLeft;
1.4538 + if( nSrc>0 ){
1.4539 + if( n>nSrc ) n = nSrc;
1.4540 + assert( pSrc );
1.4541 + memcpy(pPayload, pSrc, n);
1.4542 + }else{
1.4543 + memset(pPayload, 0, n);
1.4544 + }
1.4545 + nPayload -= n;
1.4546 + pPayload += n;
1.4547 + pSrc += n;
1.4548 + nSrc -= n;
1.4549 + spaceLeft -= n;
1.4550 + if( nSrc==0 ){
1.4551 + nSrc = nData;
1.4552 + pSrc = pData;
1.4553 + }
1.4554 + }
1.4555 + releasePage(pToRelease);
1.4556 + return SQLITE_OK;
1.4557 +}
1.4558 +
1.4559 +/*
1.4560 +** Remove the i-th cell from pPage. This routine effects pPage only.
1.4561 +** The cell content is not freed or deallocated. It is assumed that
1.4562 +** the cell content has been copied someplace else. This routine just
1.4563 +** removes the reference to the cell from pPage.
1.4564 +**
1.4565 +** "sz" must be the number of bytes in the cell.
1.4566 +*/
1.4567 +static void dropCell(MemPage *pPage, int idx, int sz){
1.4568 + int i; /* Loop counter */
1.4569 + int pc; /* Offset to cell content of cell being deleted */
1.4570 + u8 *data; /* pPage->aData */
1.4571 + u8 *ptr; /* Used to move bytes around within data[] */
1.4572 +
1.4573 + assert( idx>=0 && idx<pPage->nCell );
1.4574 + assert( sz==cellSize(pPage, idx) );
1.4575 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.4576 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4577 + data = pPage->aData;
1.4578 + ptr = &data[pPage->cellOffset + 2*idx];
1.4579 + pc = get2byte(ptr);
1.4580 + assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
1.4581 + freeSpace(pPage, pc, sz);
1.4582 + for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
1.4583 + ptr[0] = ptr[2];
1.4584 + ptr[1] = ptr[3];
1.4585 + }
1.4586 + pPage->nCell--;
1.4587 + put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
1.4588 + pPage->nFree += 2;
1.4589 +}
1.4590 +
1.4591 +/*
1.4592 +** Insert a new cell on pPage at cell index "i". pCell points to the
1.4593 +** content of the cell.
1.4594 +**
1.4595 +** If the cell content will fit on the page, then put it there. If it
1.4596 +** will not fit, then make a copy of the cell content into pTemp if
1.4597 +** pTemp is not null. Regardless of pTemp, allocate a new entry
1.4598 +** in pPage->aOvfl[] and make it point to the cell content (either
1.4599 +** in pTemp or the original pCell) and also record its index.
1.4600 +** Allocating a new entry in pPage->aCell[] implies that
1.4601 +** pPage->nOverflow is incremented.
1.4602 +**
1.4603 +** If nSkip is non-zero, then do not copy the first nSkip bytes of the
1.4604 +** cell. The caller will overwrite them after this function returns. If
1.4605 +** nSkip is non-zero, then pCell may not point to an invalid memory location
1.4606 +** (but pCell+nSkip is always valid).
1.4607 +*/
1.4608 +static int insertCell(
1.4609 + MemPage *pPage, /* Page into which we are copying */
1.4610 + int i, /* New cell becomes the i-th cell of the page */
1.4611 + u8 *pCell, /* Content of the new cell */
1.4612 + int sz, /* Bytes of content in pCell */
1.4613 + u8 *pTemp, /* Temp storage space for pCell, if needed */
1.4614 + u8 nSkip /* Do not write the first nSkip bytes of the cell */
1.4615 +){
1.4616 + int idx; /* Where to write new cell content in data[] */
1.4617 + int j; /* Loop counter */
1.4618 + int top; /* First byte of content for any cell in data[] */
1.4619 + int end; /* First byte past the last cell pointer in data[] */
1.4620 + int ins; /* Index in data[] where new cell pointer is inserted */
1.4621 + int hdr; /* Offset into data[] of the page header */
1.4622 + int cellOffset; /* Address of first cell pointer in data[] */
1.4623 + u8 *data; /* The content of the whole page */
1.4624 + u8 *ptr; /* Used for moving information around in data[] */
1.4625 +
1.4626 + assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
1.4627 + assert( sz==cellSizePtr(pPage, pCell) );
1.4628 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4629 + if( pPage->nOverflow || sz+2>pPage->nFree ){
1.4630 + if( pTemp ){
1.4631 + memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
1.4632 + pCell = pTemp;
1.4633 + }
1.4634 + j = pPage->nOverflow++;
1.4635 + assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
1.4636 + pPage->aOvfl[j].pCell = pCell;
1.4637 + pPage->aOvfl[j].idx = i;
1.4638 + pPage->nFree = 0;
1.4639 + }else{
1.4640 + int rc = sqlite3PagerWrite(pPage->pDbPage);
1.4641 + if( rc!=SQLITE_OK ){
1.4642 + return rc;
1.4643 + }
1.4644 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.4645 + data = pPage->aData;
1.4646 + hdr = pPage->hdrOffset;
1.4647 + top = get2byte(&data[hdr+5]);
1.4648 + cellOffset = pPage->cellOffset;
1.4649 + end = cellOffset + 2*pPage->nCell + 2;
1.4650 + ins = cellOffset + 2*i;
1.4651 + if( end > top - sz ){
1.4652 + defragmentPage(pPage);
1.4653 + top = get2byte(&data[hdr+5]);
1.4654 + assert( end + sz <= top );
1.4655 + }
1.4656 + idx = allocateSpace(pPage, sz);
1.4657 + assert( idx>0 );
1.4658 + assert( end <= get2byte(&data[hdr+5]) );
1.4659 + pPage->nCell++;
1.4660 + pPage->nFree -= 2;
1.4661 + memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
1.4662 + for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
1.4663 + ptr[0] = ptr[-2];
1.4664 + ptr[1] = ptr[-1];
1.4665 + }
1.4666 + put2byte(&data[ins], idx);
1.4667 + put2byte(&data[hdr+3], pPage->nCell);
1.4668 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4669 + if( pPage->pBt->autoVacuum ){
1.4670 + /* The cell may contain a pointer to an overflow page. If so, write
1.4671 + ** the entry for the overflow page into the pointer map.
1.4672 + */
1.4673 + CellInfo info;
1.4674 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4675 + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1.4676 + if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
1.4677 + Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
1.4678 + rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
1.4679 + if( rc!=SQLITE_OK ) return rc;
1.4680 + }
1.4681 + }
1.4682 +#endif
1.4683 + }
1.4684 +
1.4685 + return SQLITE_OK;
1.4686 +}
1.4687 +
1.4688 +/*
1.4689 +** Add a list of cells to a page. The page should be initially empty.
1.4690 +** The cells are guaranteed to fit on the page.
1.4691 +*/
1.4692 +static void assemblePage(
1.4693 + MemPage *pPage, /* The page to be assemblied */
1.4694 + int nCell, /* The number of cells to add to this page */
1.4695 + u8 **apCell, /* Pointers to cell bodies */
1.4696 + u16 *aSize /* Sizes of the cells */
1.4697 +){
1.4698 + int i; /* Loop counter */
1.4699 + int totalSize; /* Total size of all cells */
1.4700 + int hdr; /* Index of page header */
1.4701 + int cellptr; /* Address of next cell pointer */
1.4702 + int cellbody; /* Address of next cell body */
1.4703 + u8 *data; /* Data for the page */
1.4704 +
1.4705 + assert( pPage->nOverflow==0 );
1.4706 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4707 + totalSize = 0;
1.4708 + for(i=0; i<nCell; i++){
1.4709 + totalSize += aSize[i];
1.4710 + }
1.4711 + assert( totalSize+2*nCell<=pPage->nFree );
1.4712 + assert( pPage->nCell==0 );
1.4713 + cellptr = pPage->cellOffset;
1.4714 + data = pPage->aData;
1.4715 + hdr = pPage->hdrOffset;
1.4716 + put2byte(&data[hdr+3], nCell);
1.4717 + if( nCell ){
1.4718 + cellbody = allocateSpace(pPage, totalSize);
1.4719 + assert( cellbody>0 );
1.4720 + assert( pPage->nFree >= 2*nCell );
1.4721 + pPage->nFree -= 2*nCell;
1.4722 + for(i=0; i<nCell; i++){
1.4723 + put2byte(&data[cellptr], cellbody);
1.4724 + memcpy(&data[cellbody], apCell[i], aSize[i]);
1.4725 + cellptr += 2;
1.4726 + cellbody += aSize[i];
1.4727 + }
1.4728 + assert( cellbody==pPage->pBt->usableSize );
1.4729 + }
1.4730 + pPage->nCell = nCell;
1.4731 +}
1.4732 +
1.4733 +/*
1.4734 +** The following parameters determine how many adjacent pages get involved
1.4735 +** in a balancing operation. NN is the number of neighbors on either side
1.4736 +** of the page that participate in the balancing operation. NB is the
1.4737 +** total number of pages that participate, including the target page and
1.4738 +** NN neighbors on either side.
1.4739 +**
1.4740 +** The minimum value of NN is 1 (of course). Increasing NN above 1
1.4741 +** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
1.4742 +** in exchange for a larger degradation in INSERT and UPDATE performance.
1.4743 +** The current value of NN (1) appears to give the best results overall.
1.4744 +*/
1.4745 +#define NN 1 /* Number of neighbors on either side of pPage */
1.4746 +#define NB (NN*2+1) /* Total pages involved in the balance */
1.4747 +
1.4748 +/* Forward reference */
1.4749 +static int balance(BtCursor*, int);
1.4750 +
1.4751 +#ifndef SQLITE_OMIT_QUICKBALANCE
1.4752 +/*
1.4753 +** This version of balance() handles the common special case where
1.4754 +** a new entry is being inserted on the extreme right-end of the
1.4755 +** tree, in other words, when the new entry will become the largest
1.4756 +** entry in the tree.
1.4757 +**
1.4758 +** Instead of trying balance the 3 right-most leaf pages, just add
1.4759 +** a new page to the right-hand side and put the one new entry in
1.4760 +** that page. This leaves the right side of the tree somewhat
1.4761 +** unbalanced. But odds are that we will be inserting new entries
1.4762 +** at the end soon afterwards so the nearly empty page will quickly
1.4763 +** fill up. On average.
1.4764 +**
1.4765 +** pPage is the leaf page which is the right-most page in the tree.
1.4766 +** pParent is its parent. pPage must have a single overflow entry
1.4767 +** which is also the right-most entry on the page.
1.4768 +*/
1.4769 +static int balance_quick(BtCursor *pCur){
1.4770 + int rc;
1.4771 + MemPage *pNew = 0;
1.4772 + Pgno pgnoNew;
1.4773 + u8 *pCell;
1.4774 + u16 szCell;
1.4775 + CellInfo info;
1.4776 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.4777 + MemPage *pParent = pCur->apPage[pCur->iPage-1];
1.4778 + BtShared *pBt = pPage->pBt;
1.4779 + int parentIdx = pParent->nCell; /* pParent new divider cell index */
1.4780 + int parentSize; /* Size of new divider cell */
1.4781 + u8 parentCell[64]; /* Space for the new divider cell */
1.4782 +
1.4783 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4784 +
1.4785 + /* Allocate a new page. Insert the overflow cell from pPage
1.4786 + ** into it. Then remove the overflow cell from pPage.
1.4787 + */
1.4788 + rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
1.4789 + if( rc==SQLITE_OK ){
1.4790 + pCell = pPage->aOvfl[0].pCell;
1.4791 + szCell = cellSizePtr(pPage, pCell);
1.4792 + zeroPage(pNew, pPage->aData[0]);
1.4793 + assemblePage(pNew, 1, &pCell, &szCell);
1.4794 + pPage->nOverflow = 0;
1.4795 +
1.4796 + /* pPage is currently the right-child of pParent. Change this
1.4797 + ** so that the right-child is the new page allocated above and
1.4798 + ** pPage is the next-to-right child.
1.4799 + **
1.4800 + ** Ignore the return value of the call to fillInCell(). fillInCell()
1.4801 + ** may only return other than SQLITE_OK if it is required to allocate
1.4802 + ** one or more overflow pages. Since an internal table B-Tree cell
1.4803 + ** may never spill over onto an overflow page (it is a maximum of
1.4804 + ** 13 bytes in size), it is not neccessary to check the return code.
1.4805 + **
1.4806 + ** Similarly, the insertCell() function cannot fail if the page
1.4807 + ** being inserted into is already writable and the cell does not
1.4808 + ** contain an overflow pointer. So ignore this return code too.
1.4809 + */
1.4810 + assert( pPage->nCell>0 );
1.4811 + pCell = findCell(pPage, pPage->nCell-1);
1.4812 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4813 + fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
1.4814 + assert( parentSize<64 );
1.4815 + assert( sqlite3PagerIswriteable(pParent->pDbPage) );
1.4816 + insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
1.4817 + put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
1.4818 + put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
1.4819 +
1.4820 + /* If this is an auto-vacuum database, update the pointer map
1.4821 + ** with entries for the new page, and any pointer from the
1.4822 + ** cell on the page to an overflow page.
1.4823 + */
1.4824 + if( ISAUTOVACUUM ){
1.4825 + rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
1.4826 + if( rc==SQLITE_OK ){
1.4827 + rc = ptrmapPutOvfl(pNew, 0);
1.4828 + }
1.4829 + }
1.4830 +
1.4831 + /* Release the reference to the new page. */
1.4832 + releasePage(pNew);
1.4833 + }
1.4834 +
1.4835 + /* At this point the pPage->nFree variable is not set correctly with
1.4836 + ** respect to the content of the page (because it was set to 0 by
1.4837 + ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
1.4838 + ** correct.
1.4839 + **
1.4840 + ** This has to be done even if an error will be returned. Normally, if
1.4841 + ** an error occurs during tree balancing, the contents of MemPage are
1.4842 + ** not important, as they will be recalculated when the page is rolled
1.4843 + ** back. But here, in balance_quick(), it is possible that pPage has
1.4844 + ** not yet been marked dirty or written into the journal file. Therefore
1.4845 + ** it will not be rolled back and so it is important to make sure that
1.4846 + ** the page data and contents of MemPage are consistent.
1.4847 + */
1.4848 + pPage->isInit = 0;
1.4849 + sqlite3BtreeInitPage(pPage);
1.4850 +
1.4851 + /* If everything else succeeded, balance the parent page, in
1.4852 + ** case the divider cell inserted caused it to become overfull.
1.4853 + */
1.4854 + if( rc==SQLITE_OK ){
1.4855 + releasePage(pPage);
1.4856 + pCur->iPage--;
1.4857 + rc = balance(pCur, 0);
1.4858 + }
1.4859 + return rc;
1.4860 +}
1.4861 +#endif /* SQLITE_OMIT_QUICKBALANCE */
1.4862 +
1.4863 +/*
1.4864 +** This routine redistributes Cells on pPage and up to NN*2 siblings
1.4865 +** of pPage so that all pages have about the same amount of free space.
1.4866 +** Usually NN siblings on either side of pPage is used in the balancing,
1.4867 +** though more siblings might come from one side if pPage is the first
1.4868 +** or last child of its parent. If pPage has fewer than 2*NN siblings
1.4869 +** (something which can only happen if pPage is the root page or a
1.4870 +** child of root) then all available siblings participate in the balancing.
1.4871 +**
1.4872 +** The number of siblings of pPage might be increased or decreased by one or
1.4873 +** two in an effort to keep pages nearly full but not over full. The root page
1.4874 +** is special and is allowed to be nearly empty. If pPage is
1.4875 +** the root page, then the depth of the tree might be increased
1.4876 +** or decreased by one, as necessary, to keep the root page from being
1.4877 +** overfull or completely empty.
1.4878 +**
1.4879 +** Note that when this routine is called, some of the Cells on pPage
1.4880 +** might not actually be stored in pPage->aData[]. This can happen
1.4881 +** if the page is overfull. Part of the job of this routine is to
1.4882 +** make sure all Cells for pPage once again fit in pPage->aData[].
1.4883 +**
1.4884 +** In the course of balancing the siblings of pPage, the parent of pPage
1.4885 +** might become overfull or underfull. If that happens, then this routine
1.4886 +** is called recursively on the parent.
1.4887 +**
1.4888 +** If this routine fails for any reason, it might leave the database
1.4889 +** in a corrupted state. So if this routine fails, the database should
1.4890 +** be rolled back.
1.4891 +*/
1.4892 +static int balance_nonroot(BtCursor *pCur){
1.4893 + MemPage *pPage; /* The over or underfull page to balance */
1.4894 + MemPage *pParent; /* The parent of pPage */
1.4895 + BtShared *pBt; /* The whole database */
1.4896 + int nCell = 0; /* Number of cells in apCell[] */
1.4897 + int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
1.4898 + int nOld; /* Number of pages in apOld[] */
1.4899 + int nNew; /* Number of pages in apNew[] */
1.4900 + int nDiv; /* Number of cells in apDiv[] */
1.4901 + int i, j, k; /* Loop counters */
1.4902 + int idx; /* Index of pPage in pParent->aCell[] */
1.4903 + int nxDiv; /* Next divider slot in pParent->aCell[] */
1.4904 + int rc; /* The return code */
1.4905 + int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
1.4906 + int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
1.4907 + int usableSpace; /* Bytes in pPage beyond the header */
1.4908 + int pageFlags; /* Value of pPage->aData[0] */
1.4909 + int subtotal; /* Subtotal of bytes in cells on one page */
1.4910 + int iSpace1 = 0; /* First unused byte of aSpace1[] */
1.4911 + int iSpace2 = 0; /* First unused byte of aSpace2[] */
1.4912 + int szScratch; /* Size of scratch memory requested */
1.4913 + MemPage *apOld[NB]; /* pPage and up to two siblings */
1.4914 + Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
1.4915 + MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
1.4916 + MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
1.4917 + Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
1.4918 + u8 *apDiv[NB]; /* Divider cells in pParent */
1.4919 + int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
1.4920 + int szNew[NB+2]; /* Combined size of cells place on i-th page */
1.4921 + u8 **apCell = 0; /* All cells begin balanced */
1.4922 + u16 *szCell; /* Local size of all cells in apCell[] */
1.4923 + u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
1.4924 + u8 *aSpace1; /* Space for copies of dividers cells before balance */
1.4925 + u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
1.4926 + u8 *aFrom = 0;
1.4927 +
1.4928 + pPage = pCur->apPage[pCur->iPage];
1.4929 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4930 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.4931 +
1.4932 + /*
1.4933 + ** Find the parent page.
1.4934 + */
1.4935 + assert( pCur->iPage>0 );
1.4936 + assert( pPage->isInit );
1.4937 + assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
1.4938 + pBt = pPage->pBt;
1.4939 + pParent = pCur->apPage[pCur->iPage-1];
1.4940 + assert( pParent );
1.4941 + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
1.4942 + return rc;
1.4943 + }
1.4944 +
1.4945 + TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
1.4946 +
1.4947 +#ifndef SQLITE_OMIT_QUICKBALANCE
1.4948 + /*
1.4949 + ** A special case: If a new entry has just been inserted into a
1.4950 + ** table (that is, a btree with integer keys and all data at the leaves)
1.4951 + ** and the new entry is the right-most entry in the tree (it has the
1.4952 + ** largest key) then use the special balance_quick() routine for
1.4953 + ** balancing. balance_quick() is much faster and results in a tighter
1.4954 + ** packing of data in the common case.
1.4955 + */
1.4956 + if( pPage->leaf &&
1.4957 + pPage->intKey &&
1.4958 + pPage->nOverflow==1 &&
1.4959 + pPage->aOvfl[0].idx==pPage->nCell &&
1.4960 + pParent->pgno!=1 &&
1.4961 + get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
1.4962 + ){
1.4963 + assert( pPage->intKey );
1.4964 + /*
1.4965 + ** TODO: Check the siblings to the left of pPage. It may be that
1.4966 + ** they are not full and no new page is required.
1.4967 + */
1.4968 + return balance_quick(pCur);
1.4969 + }
1.4970 +#endif
1.4971 +
1.4972 + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
1.4973 + return rc;
1.4974 + }
1.4975 +
1.4976 + /*
1.4977 + ** Find the cell in the parent page whose left child points back
1.4978 + ** to pPage. The "idx" variable is the index of that cell. If pPage
1.4979 + ** is the rightmost child of pParent then set idx to pParent->nCell
1.4980 + */
1.4981 + idx = pCur->aiIdx[pCur->iPage-1];
1.4982 + assertParentIndex(pParent, idx, pPage->pgno);
1.4983 +
1.4984 + /*
1.4985 + ** Initialize variables so that it will be safe to jump
1.4986 + ** directly to balance_cleanup at any moment.
1.4987 + */
1.4988 + nOld = nNew = 0;
1.4989 +
1.4990 + /*
1.4991 + ** Find sibling pages to pPage and the cells in pParent that divide
1.4992 + ** the siblings. An attempt is made to find NN siblings on either
1.4993 + ** side of pPage. More siblings are taken from one side, however, if
1.4994 + ** there are fewer than NN siblings on the other side. If pParent
1.4995 + ** has NB or fewer children then all children of pParent are taken.
1.4996 + */
1.4997 + nxDiv = idx - NN;
1.4998 + if( nxDiv + NB > pParent->nCell ){
1.4999 + nxDiv = pParent->nCell - NB + 1;
1.5000 + }
1.5001 + if( nxDiv<0 ){
1.5002 + nxDiv = 0;
1.5003 + }
1.5004 + nDiv = 0;
1.5005 + for(i=0, k=nxDiv; i<NB; i++, k++){
1.5006 + if( k<pParent->nCell ){
1.5007 + apDiv[i] = findCell(pParent, k);
1.5008 + nDiv++;
1.5009 + assert( !pParent->leaf );
1.5010 + pgnoOld[i] = get4byte(apDiv[i]);
1.5011 + }else if( k==pParent->nCell ){
1.5012 + pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
1.5013 + }else{
1.5014 + break;
1.5015 + }
1.5016 + rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
1.5017 + if( rc ) goto balance_cleanup;
1.5018 + /* apOld[i]->idxParent = k; */
1.5019 + apCopy[i] = 0;
1.5020 + assert( i==nOld );
1.5021 + nOld++;
1.5022 + nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
1.5023 + }
1.5024 +
1.5025 + /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
1.5026 + ** alignment */
1.5027 + nMaxCells = (nMaxCells + 3)&~3;
1.5028 +
1.5029 + /*
1.5030 + ** Allocate space for memory structures
1.5031 + */
1.5032 + szScratch =
1.5033 + nMaxCells*sizeof(u8*) /* apCell */
1.5034 + + nMaxCells*sizeof(u16) /* szCell */
1.5035 + + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
1.5036 + + pBt->pageSize /* aSpace1 */
1.5037 + + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
1.5038 + apCell = sqlite3ScratchMalloc( szScratch );
1.5039 + if( apCell==0 ){
1.5040 + rc = SQLITE_NOMEM;
1.5041 + goto balance_cleanup;
1.5042 + }
1.5043 + szCell = (u16*)&apCell[nMaxCells];
1.5044 + aCopy[0] = (u8*)&szCell[nMaxCells];
1.5045 + assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5046 + for(i=1; i<NB; i++){
1.5047 + aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
1.5048 + assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5049 + }
1.5050 + aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
1.5051 + assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5052 + if( ISAUTOVACUUM ){
1.5053 + aFrom = &aSpace1[pBt->pageSize];
1.5054 + }
1.5055 + aSpace2 = sqlite3PageMalloc(pBt->pageSize);
1.5056 + if( aSpace2==0 ){
1.5057 + rc = SQLITE_NOMEM;
1.5058 + goto balance_cleanup;
1.5059 + }
1.5060 +
1.5061 + /*
1.5062 + ** Make copies of the content of pPage and its siblings into apCopy[].
1.5063 + ** The rest of this function will use data from the copies rather
1.5064 + ** than the original pages since the original pages will be in the
1.5065 + ** process of being overwritten.
1.5066 + */
1.5067 + for(i=0; i<nOld; i++){
1.5068 + MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
1.5069 + memcpy(p, apOld[i], sizeof(MemPage));
1.5070 + p->aData = (void*)&p[1];
1.5071 + memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
1.5072 + }
1.5073 +
1.5074 + /*
1.5075 + ** Load pointers to all cells on sibling pages and the divider cells
1.5076 + ** into the local apCell[] array. Make copies of the divider cells
1.5077 + ** into space obtained from aSpace1[] and remove the divider Cells
1.5078 + ** from pParent.
1.5079 + **
1.5080 + ** If the siblings are on leaf pages, then the child pointers of the
1.5081 + ** divider cells are stripped from the cells before they are copied
1.5082 + ** into aSpace1[]. In this way, all cells in apCell[] are without
1.5083 + ** child pointers. If siblings are not leaves, then all cell in
1.5084 + ** apCell[] include child pointers. Either way, all cells in apCell[]
1.5085 + ** are alike.
1.5086 + **
1.5087 + ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
1.5088 + ** leafData: 1 if pPage holds key+data and pParent holds only keys.
1.5089 + */
1.5090 + nCell = 0;
1.5091 + leafCorrection = pPage->leaf*4;
1.5092 + leafData = pPage->hasData;
1.5093 + for(i=0; i<nOld; i++){
1.5094 + MemPage *pOld = apCopy[i];
1.5095 + int limit = pOld->nCell+pOld->nOverflow;
1.5096 + for(j=0; j<limit; j++){
1.5097 + assert( nCell<nMaxCells );
1.5098 + apCell[nCell] = findOverflowCell(pOld, j);
1.5099 + szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
1.5100 + if( ISAUTOVACUUM ){
1.5101 + int a;
1.5102 + aFrom[nCell] = i;
1.5103 + for(a=0; a<pOld->nOverflow; a++){
1.5104 + if( pOld->aOvfl[a].pCell==apCell[nCell] ){
1.5105 + aFrom[nCell] = 0xFF;
1.5106 + break;
1.5107 + }
1.5108 + }
1.5109 + }
1.5110 + nCell++;
1.5111 + }
1.5112 + if( i<nOld-1 ){
1.5113 + u16 sz = cellSizePtr(pParent, apDiv[i]);
1.5114 + if( leafData ){
1.5115 + /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
1.5116 + ** are duplicates of keys on the child pages. We need to remove
1.5117 + ** the divider cells from pParent, but the dividers cells are not
1.5118 + ** added to apCell[] because they are duplicates of child cells.
1.5119 + */
1.5120 + dropCell(pParent, nxDiv, sz);
1.5121 + }else{
1.5122 + u8 *pTemp;
1.5123 + assert( nCell<nMaxCells );
1.5124 + szCell[nCell] = sz;
1.5125 + pTemp = &aSpace1[iSpace1];
1.5126 + iSpace1 += sz;
1.5127 + assert( sz<=pBt->pageSize/4 );
1.5128 + assert( iSpace1<=pBt->pageSize );
1.5129 + memcpy(pTemp, apDiv[i], sz);
1.5130 + apCell[nCell] = pTemp+leafCorrection;
1.5131 + if( ISAUTOVACUUM ){
1.5132 + aFrom[nCell] = 0xFF;
1.5133 + }
1.5134 + dropCell(pParent, nxDiv, sz);
1.5135 + szCell[nCell] -= leafCorrection;
1.5136 + assert( get4byte(pTemp)==pgnoOld[i] );
1.5137 + if( !pOld->leaf ){
1.5138 + assert( leafCorrection==0 );
1.5139 + /* The right pointer of the child page pOld becomes the left
1.5140 + ** pointer of the divider cell */
1.5141 + memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
1.5142 + }else{
1.5143 + assert( leafCorrection==4 );
1.5144 + if( szCell[nCell]<4 ){
1.5145 + /* Do not allow any cells smaller than 4 bytes. */
1.5146 + szCell[nCell] = 4;
1.5147 + }
1.5148 + }
1.5149 + nCell++;
1.5150 + }
1.5151 + }
1.5152 + }
1.5153 +
1.5154 + /*
1.5155 + ** Figure out the number of pages needed to hold all nCell cells.
1.5156 + ** Store this number in "k". Also compute szNew[] which is the total
1.5157 + ** size of all cells on the i-th page and cntNew[] which is the index
1.5158 + ** in apCell[] of the cell that divides page i from page i+1.
1.5159 + ** cntNew[k] should equal nCell.
1.5160 + **
1.5161 + ** Values computed by this block:
1.5162 + **
1.5163 + ** k: The total number of sibling pages
1.5164 + ** szNew[i]: Spaced used on the i-th sibling page.
1.5165 + ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
1.5166 + ** the right of the i-th sibling page.
1.5167 + ** usableSpace: Number of bytes of space available on each sibling.
1.5168 + **
1.5169 + */
1.5170 + usableSpace = pBt->usableSize - 12 + leafCorrection;
1.5171 + for(subtotal=k=i=0; i<nCell; i++){
1.5172 + assert( i<nMaxCells );
1.5173 + subtotal += szCell[i] + 2;
1.5174 + if( subtotal > usableSpace ){
1.5175 + szNew[k] = subtotal - szCell[i];
1.5176 + cntNew[k] = i;
1.5177 + if( leafData ){ i--; }
1.5178 + subtotal = 0;
1.5179 + k++;
1.5180 + }
1.5181 + }
1.5182 + szNew[k] = subtotal;
1.5183 + cntNew[k] = nCell;
1.5184 + k++;
1.5185 +
1.5186 + /*
1.5187 + ** The packing computed by the previous block is biased toward the siblings
1.5188 + ** on the left side. The left siblings are always nearly full, while the
1.5189 + ** right-most sibling might be nearly empty. This block of code attempts
1.5190 + ** to adjust the packing of siblings to get a better balance.
1.5191 + **
1.5192 + ** This adjustment is more than an optimization. The packing above might
1.5193 + ** be so out of balance as to be illegal. For example, the right-most
1.5194 + ** sibling might be completely empty. This adjustment is not optional.
1.5195 + */
1.5196 + for(i=k-1; i>0; i--){
1.5197 + int szRight = szNew[i]; /* Size of sibling on the right */
1.5198 + int szLeft = szNew[i-1]; /* Size of sibling on the left */
1.5199 + int r; /* Index of right-most cell in left sibling */
1.5200 + int d; /* Index of first cell to the left of right sibling */
1.5201 +
1.5202 + r = cntNew[i-1] - 1;
1.5203 + d = r + 1 - leafData;
1.5204 + assert( d<nMaxCells );
1.5205 + assert( r<nMaxCells );
1.5206 + while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
1.5207 + szRight += szCell[d] + 2;
1.5208 + szLeft -= szCell[r] + 2;
1.5209 + cntNew[i-1]--;
1.5210 + r = cntNew[i-1] - 1;
1.5211 + d = r + 1 - leafData;
1.5212 + }
1.5213 + szNew[i] = szRight;
1.5214 + szNew[i-1] = szLeft;
1.5215 + }
1.5216 +
1.5217 + /* Either we found one or more cells (cntNew[0]>0) or we are
1.5218 + ** a virtual root page. A virtual root page is when the real root
1.5219 + ** page is page 1 and we are the only child of that page.
1.5220 + */
1.5221 + assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
1.5222 +
1.5223 + /*
1.5224 + ** Allocate k new pages. Reuse old pages where possible.
1.5225 + */
1.5226 + assert( pPage->pgno>1 );
1.5227 + pageFlags = pPage->aData[0];
1.5228 + for(i=0; i<k; i++){
1.5229 + MemPage *pNew;
1.5230 + if( i<nOld ){
1.5231 + pNew = apNew[i] = apOld[i];
1.5232 + pgnoNew[i] = pgnoOld[i];
1.5233 + apOld[i] = 0;
1.5234 + rc = sqlite3PagerWrite(pNew->pDbPage);
1.5235 + nNew++;
1.5236 + if( rc ) goto balance_cleanup;
1.5237 + }else{
1.5238 + assert( i>0 );
1.5239 + rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
1.5240 + if( rc ) goto balance_cleanup;
1.5241 + apNew[i] = pNew;
1.5242 + nNew++;
1.5243 + }
1.5244 + }
1.5245 +
1.5246 + /* Free any old pages that were not reused as new pages.
1.5247 + */
1.5248 + while( i<nOld ){
1.5249 + rc = freePage(apOld[i]);
1.5250 + if( rc ) goto balance_cleanup;
1.5251 + releasePage(apOld[i]);
1.5252 + apOld[i] = 0;
1.5253 + i++;
1.5254 + }
1.5255 +
1.5256 + /*
1.5257 + ** Put the new pages in ascending order. This helps to
1.5258 + ** keep entries in the disk file in order so that a scan
1.5259 + ** of the table is a linear scan through the file. That
1.5260 + ** in turn helps the operating system to deliver pages
1.5261 + ** from the disk more rapidly.
1.5262 + **
1.5263 + ** An O(n^2) insertion sort algorithm is used, but since
1.5264 + ** n is never more than NB (a small constant), that should
1.5265 + ** not be a problem.
1.5266 + **
1.5267 + ** When NB==3, this one optimization makes the database
1.5268 + ** about 25% faster for large insertions and deletions.
1.5269 + */
1.5270 + for(i=0; i<k-1; i++){
1.5271 + int minV = pgnoNew[i];
1.5272 + int minI = i;
1.5273 + for(j=i+1; j<k; j++){
1.5274 + if( pgnoNew[j]<(unsigned)minV ){
1.5275 + minI = j;
1.5276 + minV = pgnoNew[j];
1.5277 + }
1.5278 + }
1.5279 + if( minI>i ){
1.5280 + int t;
1.5281 + MemPage *pT;
1.5282 + t = pgnoNew[i];
1.5283 + pT = apNew[i];
1.5284 + pgnoNew[i] = pgnoNew[minI];
1.5285 + apNew[i] = apNew[minI];
1.5286 + pgnoNew[minI] = t;
1.5287 + apNew[minI] = pT;
1.5288 + }
1.5289 + }
1.5290 + TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
1.5291 + pgnoOld[0],
1.5292 + nOld>=2 ? pgnoOld[1] : 0,
1.5293 + nOld>=3 ? pgnoOld[2] : 0,
1.5294 + pgnoNew[0], szNew[0],
1.5295 + nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
1.5296 + nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
1.5297 + nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
1.5298 + nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
1.5299 +
1.5300 + /*
1.5301 + ** Evenly distribute the data in apCell[] across the new pages.
1.5302 + ** Insert divider cells into pParent as necessary.
1.5303 + */
1.5304 + j = 0;
1.5305 + for(i=0; i<nNew; i++){
1.5306 + /* Assemble the new sibling page. */
1.5307 + MemPage *pNew = apNew[i];
1.5308 + assert( j<nMaxCells );
1.5309 + assert( pNew->pgno==pgnoNew[i] );
1.5310 + zeroPage(pNew, pageFlags);
1.5311 + assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
1.5312 + assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
1.5313 + assert( pNew->nOverflow==0 );
1.5314 +
1.5315 + /* If this is an auto-vacuum database, update the pointer map entries
1.5316 + ** that point to the siblings that were rearranged. These can be: left
1.5317 + ** children of cells, the right-child of the page, or overflow pages
1.5318 + ** pointed to by cells.
1.5319 + */
1.5320 + if( ISAUTOVACUUM ){
1.5321 + for(k=j; k<cntNew[i]; k++){
1.5322 + assert( k<nMaxCells );
1.5323 + if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
1.5324 + rc = ptrmapPutOvfl(pNew, k-j);
1.5325 + if( rc==SQLITE_OK && leafCorrection==0 ){
1.5326 + rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
1.5327 + }
1.5328 + if( rc!=SQLITE_OK ){
1.5329 + goto balance_cleanup;
1.5330 + }
1.5331 + }
1.5332 + }
1.5333 + }
1.5334 +
1.5335 + j = cntNew[i];
1.5336 +
1.5337 + /* If the sibling page assembled above was not the right-most sibling,
1.5338 + ** insert a divider cell into the parent page.
1.5339 + */
1.5340 + if( i<nNew-1 && j<nCell ){
1.5341 + u8 *pCell;
1.5342 + u8 *pTemp;
1.5343 + int sz;
1.5344 +
1.5345 + assert( j<nMaxCells );
1.5346 + pCell = apCell[j];
1.5347 + sz = szCell[j] + leafCorrection;
1.5348 + pTemp = &aSpace2[iSpace2];
1.5349 + if( !pNew->leaf ){
1.5350 + memcpy(&pNew->aData[8], pCell, 4);
1.5351 + if( ISAUTOVACUUM
1.5352 + && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
1.5353 + ){
1.5354 + rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
1.5355 + if( rc!=SQLITE_OK ){
1.5356 + goto balance_cleanup;
1.5357 + }
1.5358 + }
1.5359 + }else if( leafData ){
1.5360 + /* If the tree is a leaf-data tree, and the siblings are leaves,
1.5361 + ** then there is no divider cell in apCell[]. Instead, the divider
1.5362 + ** cell consists of the integer key for the right-most cell of
1.5363 + ** the sibling-page assembled above only.
1.5364 + */
1.5365 + CellInfo info;
1.5366 + j--;
1.5367 + sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
1.5368 + pCell = pTemp;
1.5369 + fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
1.5370 + pTemp = 0;
1.5371 + }else{
1.5372 + pCell -= 4;
1.5373 + /* Obscure case for non-leaf-data trees: If the cell at pCell was
1.5374 + ** previously stored on a leaf node, and its reported size was 4
1.5375 + ** bytes, then it may actually be smaller than this
1.5376 + ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
1.5377 + ** any cell). But it is important to pass the correct size to
1.5378 + ** insertCell(), so reparse the cell now.
1.5379 + **
1.5380 + ** Note that this can never happen in an SQLite data file, as all
1.5381 + ** cells are at least 4 bytes. It only happens in b-trees used
1.5382 + ** to evaluate "IN (SELECT ...)" and similar clauses.
1.5383 + */
1.5384 + if( szCell[j]==4 ){
1.5385 + assert(leafCorrection==4);
1.5386 + sz = cellSizePtr(pParent, pCell);
1.5387 + }
1.5388 + }
1.5389 + iSpace2 += sz;
1.5390 + assert( sz<=pBt->pageSize/4 );
1.5391 + assert( iSpace2<=pBt->pageSize );
1.5392 + rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
1.5393 + if( rc!=SQLITE_OK ) goto balance_cleanup;
1.5394 + put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
1.5395 +
1.5396 + /* If this is an auto-vacuum database, and not a leaf-data tree,
1.5397 + ** then update the pointer map with an entry for the overflow page
1.5398 + ** that the cell just inserted points to (if any).
1.5399 + */
1.5400 + if( ISAUTOVACUUM && !leafData ){
1.5401 + rc = ptrmapPutOvfl(pParent, nxDiv);
1.5402 + if( rc!=SQLITE_OK ){
1.5403 + goto balance_cleanup;
1.5404 + }
1.5405 + }
1.5406 + j++;
1.5407 + nxDiv++;
1.5408 + }
1.5409 +
1.5410 + /* Set the pointer-map entry for the new sibling page. */
1.5411 + if( ISAUTOVACUUM ){
1.5412 + rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
1.5413 + if( rc!=SQLITE_OK ){
1.5414 + goto balance_cleanup;
1.5415 + }
1.5416 + }
1.5417 + }
1.5418 + assert( j==nCell );
1.5419 + assert( nOld>0 );
1.5420 + assert( nNew>0 );
1.5421 + if( (pageFlags & PTF_LEAF)==0 ){
1.5422 + u8 *zChild = &apCopy[nOld-1]->aData[8];
1.5423 + memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
1.5424 + if( ISAUTOVACUUM ){
1.5425 + rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
1.5426 + if( rc!=SQLITE_OK ){
1.5427 + goto balance_cleanup;
1.5428 + }
1.5429 + }
1.5430 + }
1.5431 + if( nxDiv==pParent->nCell+pParent->nOverflow ){
1.5432 + /* Right-most sibling is the right-most child of pParent */
1.5433 + put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
1.5434 + }else{
1.5435 + /* Right-most sibling is the left child of the first entry in pParent
1.5436 + ** past the right-most divider entry */
1.5437 + put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
1.5438 + }
1.5439 +
1.5440 + /*
1.5441 + ** Balance the parent page. Note that the current page (pPage) might
1.5442 + ** have been added to the freelist so it might no longer be initialized.
1.5443 + ** But the parent page will always be initialized.
1.5444 + */
1.5445 + assert( pParent->isInit );
1.5446 + sqlite3ScratchFree(apCell);
1.5447 + apCell = 0;
1.5448 + releasePage(pPage);
1.5449 + pCur->iPage--;
1.5450 + rc = balance(pCur, 0);
1.5451 +
1.5452 + /*
1.5453 + ** Cleanup before returning.
1.5454 + */
1.5455 +balance_cleanup:
1.5456 + sqlite3PageFree(aSpace2);
1.5457 + sqlite3ScratchFree(apCell);
1.5458 + for(i=0; i<nOld; i++){
1.5459 + releasePage(apOld[i]);
1.5460 + }
1.5461 + for(i=0; i<nNew; i++){
1.5462 + releasePage(apNew[i]);
1.5463 + }
1.5464 +
1.5465 + /* releasePage(pParent); */
1.5466 + TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
1.5467 + pPage->pgno, nOld, nNew, nCell));
1.5468 +
1.5469 + return rc;
1.5470 +}
1.5471 +
1.5472 +/*
1.5473 +** This routine is called for the root page of a btree when the root
1.5474 +** page contains no cells. This is an opportunity to make the tree
1.5475 +** shallower by one level.
1.5476 +*/
1.5477 +static int balance_shallower(BtCursor *pCur){
1.5478 + MemPage *pPage; /* Root page of B-Tree */
1.5479 + MemPage *pChild; /* The only child page of pPage */
1.5480 + Pgno pgnoChild; /* Page number for pChild */
1.5481 + int rc = SQLITE_OK; /* Return code from subprocedures */
1.5482 + BtShared *pBt; /* The main BTree structure */
1.5483 + int mxCellPerPage; /* Maximum number of cells per page */
1.5484 + u8 **apCell; /* All cells from pages being balanced */
1.5485 + u16 *szCell; /* Local size of all cells */
1.5486 +
1.5487 + assert( pCur->iPage==0 );
1.5488 + pPage = pCur->apPage[0];
1.5489 +
1.5490 + assert( pPage->nCell==0 );
1.5491 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.5492 + pBt = pPage->pBt;
1.5493 + mxCellPerPage = MX_CELL(pBt);
1.5494 + apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
1.5495 + if( apCell==0 ) return SQLITE_NOMEM;
1.5496 + szCell = (u16*)&apCell[mxCellPerPage];
1.5497 + if( pPage->leaf ){
1.5498 + /* The table is completely empty */
1.5499 + TRACE(("BALANCE: empty table %d\n", pPage->pgno));
1.5500 + }else{
1.5501 + /* The root page is empty but has one child. Transfer the
1.5502 + ** information from that one child into the root page if it
1.5503 + ** will fit. This reduces the depth of the tree by one.
1.5504 + **
1.5505 + ** If the root page is page 1, it has less space available than
1.5506 + ** its child (due to the 100 byte header that occurs at the beginning
1.5507 + ** of the database fle), so it might not be able to hold all of the
1.5508 + ** information currently contained in the child. If this is the
1.5509 + ** case, then do not do the transfer. Leave page 1 empty except
1.5510 + ** for the right-pointer to the child page. The child page becomes
1.5511 + ** the virtual root of the tree.
1.5512 + */
1.5513 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.5514 + pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.5515 + assert( pgnoChild>0 );
1.5516 + assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
1.5517 + rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
1.5518 + if( rc ) goto end_shallow_balance;
1.5519 + if( pPage->pgno==1 ){
1.5520 + rc = sqlite3BtreeInitPage(pChild);
1.5521 + if( rc ) goto end_shallow_balance;
1.5522 + assert( pChild->nOverflow==0 );
1.5523 + if( pChild->nFree>=100 ){
1.5524 + /* The child information will fit on the root page, so do the
1.5525 + ** copy */
1.5526 + int i;
1.5527 + zeroPage(pPage, pChild->aData[0]);
1.5528 + for(i=0; i<pChild->nCell; i++){
1.5529 + apCell[i] = findCell(pChild,i);
1.5530 + szCell[i] = cellSizePtr(pChild, apCell[i]);
1.5531 + }
1.5532 + assemblePage(pPage, pChild->nCell, apCell, szCell);
1.5533 + /* Copy the right-pointer of the child to the parent. */
1.5534 + put4byte(&pPage->aData[pPage->hdrOffset+8],
1.5535 + get4byte(&pChild->aData[pChild->hdrOffset+8]));
1.5536 + freePage(pChild);
1.5537 + TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
1.5538 + }else{
1.5539 + /* The child has more information that will fit on the root.
1.5540 + ** The tree is already balanced. Do nothing. */
1.5541 + TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
1.5542 + }
1.5543 + }else{
1.5544 + memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
1.5545 + pPage->isInit = 0;
1.5546 + rc = sqlite3BtreeInitPage(pPage);
1.5547 + assert( rc==SQLITE_OK );
1.5548 + freePage(pChild);
1.5549 + TRACE(("BALANCE: transfer child %d into root %d\n",
1.5550 + pChild->pgno, pPage->pgno));
1.5551 + }
1.5552 + assert( pPage->nOverflow==0 );
1.5553 + if( ISAUTOVACUUM ){
1.5554 + rc = setChildPtrmaps(pPage);
1.5555 + }
1.5556 + releasePage(pChild);
1.5557 + }
1.5558 +end_shallow_balance:
1.5559 + sqlite3_free(apCell);
1.5560 + return rc;
1.5561 +}
1.5562 +
1.5563 +
1.5564 +/*
1.5565 +** The root page is overfull
1.5566 +**
1.5567 +** When this happens, Create a new child page and copy the
1.5568 +** contents of the root into the child. Then make the root
1.5569 +** page an empty page with rightChild pointing to the new
1.5570 +** child. Finally, call balance_internal() on the new child
1.5571 +** to cause it to split.
1.5572 +*/
1.5573 +static int balance_deeper(BtCursor *pCur){
1.5574 + int rc; /* Return value from subprocedures */
1.5575 + MemPage *pPage; /* Pointer to the root page */
1.5576 + MemPage *pChild; /* Pointer to a new child page */
1.5577 + Pgno pgnoChild; /* Page number of the new child page */
1.5578 + BtShared *pBt; /* The BTree */
1.5579 + int usableSize; /* Total usable size of a page */
1.5580 + u8 *data; /* Content of the parent page */
1.5581 + u8 *cdata; /* Content of the child page */
1.5582 + int hdr; /* Offset to page header in parent */
1.5583 + int cbrk; /* Offset to content of first cell in parent */
1.5584 +
1.5585 + assert( pCur->iPage==0 );
1.5586 + assert( pCur->apPage[0]->nOverflow>0 );
1.5587 +
1.5588 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.5589 + pPage = pCur->apPage[0];
1.5590 + pBt = pPage->pBt;
1.5591 + assert( sqlite3_mutex_held(pBt->mutex) );
1.5592 + rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
1.5593 + if( rc ) return rc;
1.5594 + assert( sqlite3PagerIswriteable(pChild->pDbPage) );
1.5595 + usableSize = pBt->usableSize;
1.5596 + data = pPage->aData;
1.5597 + hdr = pPage->hdrOffset;
1.5598 + cbrk = get2byte(&data[hdr+5]);
1.5599 + cdata = pChild->aData;
1.5600 + memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
1.5601 + memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
1.5602 +
1.5603 + rc = sqlite3BtreeInitPage(pChild);
1.5604 + if( rc==SQLITE_OK ){
1.5605 + int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
1.5606 + memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
1.5607 + pChild->nOverflow = pPage->nOverflow;
1.5608 + if( pChild->nOverflow ){
1.5609 + pChild->nFree = 0;
1.5610 + }
1.5611 + assert( pChild->nCell==pPage->nCell );
1.5612 + zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
1.5613 + put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
1.5614 + TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
1.5615 + if( ISAUTOVACUUM ){
1.5616 + rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
1.5617 + if( rc==SQLITE_OK ){
1.5618 + rc = setChildPtrmaps(pChild);
1.5619 + }
1.5620 + }
1.5621 + }
1.5622 +
1.5623 + if( rc==SQLITE_OK ){
1.5624 + pCur->iPage++;
1.5625 + pCur->apPage[1] = pChild;
1.5626 + pCur->aiIdx[0] = 0;
1.5627 + rc = balance_nonroot(pCur);
1.5628 + }else{
1.5629 + releasePage(pChild);
1.5630 + }
1.5631 +
1.5632 + return rc;
1.5633 +}
1.5634 +
1.5635 +/*
1.5636 +** The page that pCur currently points to has just been modified in
1.5637 +** some way. This function figures out if this modification means the
1.5638 +** tree needs to be balanced, and if so calls the appropriate balancing
1.5639 +** routine.
1.5640 +**
1.5641 +** Parameter isInsert is true if a new cell was just inserted into the
1.5642 +** page, or false otherwise.
1.5643 +*/
1.5644 +static int balance(BtCursor *pCur, int isInsert){
1.5645 + int rc = SQLITE_OK;
1.5646 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.5647 +
1.5648 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.5649 + if( pCur->iPage==0 ){
1.5650 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.5651 + if( rc==SQLITE_OK && pPage->nOverflow>0 ){
1.5652 + rc = balance_deeper(pCur);
1.5653 + }
1.5654 + if( rc==SQLITE_OK && pPage->nCell==0 ){
1.5655 + rc = balance_shallower(pCur);
1.5656 + }
1.5657 + }else{
1.5658 + if( pPage->nOverflow>0 ||
1.5659 + (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
1.5660 + rc = balance_nonroot(pCur);
1.5661 + }
1.5662 + }
1.5663 + return rc;
1.5664 +}
1.5665 +
1.5666 +/*
1.5667 +** This routine checks all cursors that point to table pgnoRoot.
1.5668 +** If any of those cursors were opened with wrFlag==0 in a different
1.5669 +** database connection (a database connection that shares the pager
1.5670 +** cache with the current connection) and that other connection
1.5671 +** is not in the ReadUncommmitted state, then this routine returns
1.5672 +** SQLITE_LOCKED.
1.5673 +**
1.5674 +** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
1.5675 +** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
1.5676 +** blob cursors are used for both reading and writing.
1.5677 +**
1.5678 +** When pgnoRoot is the root page of an intkey table, this function is also
1.5679 +** responsible for invalidating incremental blob cursors when the table row
1.5680 +** on which they are opened is deleted or modified. Cursors are invalidated
1.5681 +** according to the following rules:
1.5682 +**
1.5683 +** 1) When BtreeClearTable() is called to completely delete the contents
1.5684 +** of a B-Tree table, pExclude is set to zero and parameter iRow is
1.5685 +** set to non-zero. In this case all incremental blob cursors open
1.5686 +** on the table rooted at pgnoRoot are invalidated.
1.5687 +**
1.5688 +** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
1.5689 +** modify a table row via an SQL statement, pExclude is set to the
1.5690 +** write cursor used to do the modification and parameter iRow is set
1.5691 +** to the integer row id of the B-Tree entry being modified. Unless
1.5692 +** pExclude is itself an incremental blob cursor, then all incremental
1.5693 +** blob cursors open on row iRow of the B-Tree are invalidated.
1.5694 +**
1.5695 +** 3) If both pExclude and iRow are set to zero, no incremental blob
1.5696 +** cursors are invalidated.
1.5697 +*/
1.5698 +static int checkReadLocks(
1.5699 + Btree *pBtree,
1.5700 + Pgno pgnoRoot,
1.5701 + BtCursor *pExclude,
1.5702 + i64 iRow
1.5703 +){
1.5704 + BtCursor *p;
1.5705 + BtShared *pBt = pBtree->pBt;
1.5706 + sqlite3 *db = pBtree->db;
1.5707 + assert( sqlite3BtreeHoldsMutex(pBtree) );
1.5708 + for(p=pBt->pCursor; p; p=p->pNext){
1.5709 + if( p==pExclude ) continue;
1.5710 + if( p->pgnoRoot!=pgnoRoot ) continue;
1.5711 +#ifndef SQLITE_OMIT_INCRBLOB
1.5712 + if( p->isIncrblobHandle && (
1.5713 + (!pExclude && iRow)
1.5714 + || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
1.5715 + )){
1.5716 + p->eState = CURSOR_INVALID;
1.5717 + }
1.5718 +#endif
1.5719 + if( p->eState!=CURSOR_VALID ) continue;
1.5720 + if( p->wrFlag==0
1.5721 +#ifndef SQLITE_OMIT_INCRBLOB
1.5722 + || p->isIncrblobHandle
1.5723 +#endif
1.5724 + ){
1.5725 + sqlite3 *dbOther = p->pBtree->db;
1.5726 + if( dbOther==0 ||
1.5727 + (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
1.5728 + return SQLITE_LOCKED;
1.5729 + }
1.5730 + }
1.5731 + }
1.5732 + return SQLITE_OK;
1.5733 +}
1.5734 +
1.5735 +/*
1.5736 +** Insert a new record into the BTree. The key is given by (pKey,nKey)
1.5737 +** and the data is given by (pData,nData). The cursor is used only to
1.5738 +** define what table the record should be inserted into. The cursor
1.5739 +** is left pointing at a random location.
1.5740 +**
1.5741 +** For an INTKEY table, only the nKey value of the key is used. pKey is
1.5742 +** ignored. For a ZERODATA table, the pData and nData are both ignored.
1.5743 +*/
1.5744 +int sqlite3BtreeInsert(
1.5745 + BtCursor *pCur, /* Insert data into the table of this cursor */
1.5746 + const void *pKey, i64 nKey, /* The key of the new record */
1.5747 + const void *pData, int nData, /* The data of the new record */
1.5748 + int nZero, /* Number of extra 0 bytes to append to data */
1.5749 + int appendBias /* True if this is likely an append */
1.5750 +){
1.5751 + int rc;
1.5752 + int loc;
1.5753 + int szNew;
1.5754 + int idx;
1.5755 + MemPage *pPage;
1.5756 + Btree *p = pCur->pBtree;
1.5757 + BtShared *pBt = p->pBt;
1.5758 + unsigned char *oldCell;
1.5759 + unsigned char *newCell = 0;
1.5760 +
1.5761 + assert( cursorHoldsMutex(pCur) );
1.5762 + if( pBt->inTransaction!=TRANS_WRITE ){
1.5763 + /* Must start a transaction before doing an insert */
1.5764 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.5765 + return rc;
1.5766 + }
1.5767 + assert( !pBt->readOnly );
1.5768 + if( !pCur->wrFlag ){
1.5769 + return SQLITE_PERM; /* Cursor not open for writing */
1.5770 + }
1.5771 + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
1.5772 + return SQLITE_LOCKED; /* The table pCur points to has a read lock */
1.5773 + }
1.5774 + if( pCur->eState==CURSOR_FAULT ){
1.5775 + return pCur->skip;
1.5776 + }
1.5777 +
1.5778 + /* Save the positions of any other cursors open on this table */
1.5779 + clearCursorPosition(pCur);
1.5780 + if(
1.5781 + SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
1.5782 + SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
1.5783 + ){
1.5784 + return rc;
1.5785 + }
1.5786 +
1.5787 + pPage = pCur->apPage[pCur->iPage];
1.5788 + assert( pPage->intKey || nKey>=0 );
1.5789 + assert( pPage->leaf || !pPage->intKey );
1.5790 + TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
1.5791 + pCur->pgnoRoot, nKey, nData, pPage->pgno,
1.5792 + loc==0 ? "overwrite" : "new entry"));
1.5793 + assert( pPage->isInit );
1.5794 + allocateTempSpace(pBt);
1.5795 + newCell = pBt->pTmpSpace;
1.5796 + if( newCell==0 ) return SQLITE_NOMEM;
1.5797 + rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
1.5798 + if( rc ) goto end_insert;
1.5799 + assert( szNew==cellSizePtr(pPage, newCell) );
1.5800 + assert( szNew<=MX_CELL_SIZE(pBt) );
1.5801 + idx = pCur->aiIdx[pCur->iPage];
1.5802 + if( loc==0 && CURSOR_VALID==pCur->eState ){
1.5803 + u16 szOld;
1.5804 + assert( idx<pPage->nCell );
1.5805 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.5806 + if( rc ){
1.5807 + goto end_insert;
1.5808 + }
1.5809 + oldCell = findCell(pPage, idx);
1.5810 + if( !pPage->leaf ){
1.5811 + memcpy(newCell, oldCell, 4);
1.5812 + }
1.5813 + szOld = cellSizePtr(pPage, oldCell);
1.5814 + rc = clearCell(pPage, oldCell);
1.5815 + if( rc ) goto end_insert;
1.5816 + dropCell(pPage, idx, szOld);
1.5817 + }else if( loc<0 && pPage->nCell>0 ){
1.5818 + assert( pPage->leaf );
1.5819 + idx = ++pCur->aiIdx[pCur->iPage];
1.5820 + pCur->info.nSize = 0;
1.5821 + pCur->validNKey = 0;
1.5822 + }else{
1.5823 + assert( pPage->leaf );
1.5824 + }
1.5825 + rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
1.5826 + if( rc!=SQLITE_OK ) goto end_insert;
1.5827 + rc = balance(pCur, 1);
1.5828 + if( rc==SQLITE_OK ){
1.5829 + moveToRoot(pCur);
1.5830 + }
1.5831 +end_insert:
1.5832 + return rc;
1.5833 +}
1.5834 +
1.5835 +/*
1.5836 +** Delete the entry that the cursor is pointing to. The cursor
1.5837 +** is left pointing at a arbitrary location.
1.5838 +*/
1.5839 +int sqlite3BtreeDelete(BtCursor *pCur){
1.5840 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.5841 + int idx;
1.5842 + unsigned char *pCell;
1.5843 + int rc;
1.5844 + Pgno pgnoChild = 0;
1.5845 + Btree *p = pCur->pBtree;
1.5846 + BtShared *pBt = p->pBt;
1.5847 +
1.5848 + assert( cursorHoldsMutex(pCur) );
1.5849 + assert( pPage->isInit );
1.5850 + if( pBt->inTransaction!=TRANS_WRITE ){
1.5851 + /* Must start a transaction before doing a delete */
1.5852 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.5853 + return rc;
1.5854 + }
1.5855 + assert( !pBt->readOnly );
1.5856 + if( pCur->eState==CURSOR_FAULT ){
1.5857 + return pCur->skip;
1.5858 + }
1.5859 + if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
1.5860 + return SQLITE_ERROR; /* The cursor is not pointing to anything */
1.5861 + }
1.5862 + if( !pCur->wrFlag ){
1.5863 + return SQLITE_PERM; /* Did not open this cursor for writing */
1.5864 + }
1.5865 + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
1.5866 + return SQLITE_LOCKED; /* The table pCur points to has a read lock */
1.5867 + }
1.5868 +
1.5869 + /* Restore the current cursor position (a no-op if the cursor is not in
1.5870 + ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
1.5871 + ** open on the same table. Then call sqlite3PagerWrite() on the page
1.5872 + ** that the entry will be deleted from.
1.5873 + */
1.5874 + if(
1.5875 + (rc = restoreCursorPosition(pCur))!=0 ||
1.5876 + (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
1.5877 + (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
1.5878 + ){
1.5879 + return rc;
1.5880 + }
1.5881 +
1.5882 + /* Locate the cell within its page and leave pCell pointing to the
1.5883 + ** data. The clearCell() call frees any overflow pages associated with the
1.5884 + ** cell. The cell itself is still intact.
1.5885 + */
1.5886 + idx = pCur->aiIdx[pCur->iPage];
1.5887 + pCell = findCell(pPage, idx);
1.5888 + if( !pPage->leaf ){
1.5889 + pgnoChild = get4byte(pCell);
1.5890 + }
1.5891 + rc = clearCell(pPage, pCell);
1.5892 + if( rc ){
1.5893 + return rc;
1.5894 + }
1.5895 +
1.5896 + if( !pPage->leaf ){
1.5897 + /*
1.5898 + ** The entry we are about to delete is not a leaf so if we do not
1.5899 + ** do something we will leave a hole on an internal page.
1.5900 + ** We have to fill the hole by moving in a cell from a leaf. The
1.5901 + ** next Cell after the one to be deleted is guaranteed to exist and
1.5902 + ** to be a leaf so we can use it.
1.5903 + */
1.5904 + BtCursor leafCur;
1.5905 + MemPage *pLeafPage;
1.5906 +
1.5907 + unsigned char *pNext;
1.5908 + int notUsed;
1.5909 + unsigned char *tempCell = 0;
1.5910 + assert( !pPage->intKey );
1.5911 + sqlite3BtreeGetTempCursor(pCur, &leafCur);
1.5912 + rc = sqlite3BtreeNext(&leafCur, ¬Used);
1.5913 + if( rc==SQLITE_OK ){
1.5914 + assert( leafCur.aiIdx[leafCur.iPage]==0 );
1.5915 + pLeafPage = leafCur.apPage[leafCur.iPage];
1.5916 + rc = sqlite3PagerWrite(pLeafPage->pDbPage);
1.5917 + }
1.5918 + if( rc==SQLITE_OK ){
1.5919 + int leafCursorInvalid = 0;
1.5920 + u16 szNext;
1.5921 + TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
1.5922 + pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
1.5923 + dropCell(pPage, idx, cellSizePtr(pPage, pCell));
1.5924 + pNext = findCell(pLeafPage, 0);
1.5925 + szNext = cellSizePtr(pLeafPage, pNext);
1.5926 + assert( MX_CELL_SIZE(pBt)>=szNext+4 );
1.5927 + allocateTempSpace(pBt);
1.5928 + tempCell = pBt->pTmpSpace;
1.5929 + if( tempCell==0 ){
1.5930 + rc = SQLITE_NOMEM;
1.5931 + }
1.5932 + if( rc==SQLITE_OK ){
1.5933 + rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
1.5934 + }
1.5935 +
1.5936 +
1.5937 + /* The "if" statement in the next code block is critical. The
1.5938 + ** slightest error in that statement would allow SQLite to operate
1.5939 + ** correctly most of the time but produce very rare failures. To
1.5940 + ** guard against this, the following macros help to verify that
1.5941 + ** the "if" statement is well tested.
1.5942 + */
1.5943 + testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
1.5944 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5945 + testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
1.5946 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5947 + testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
1.5948 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5949 + testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
1.5950 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5951 + testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
1.5952 + && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
1.5953 +
1.5954 +
1.5955 + if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
1.5956 + (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
1.5957 + ){
1.5958 + /* This branch is taken if the internal node is now either overflowing
1.5959 + ** or underfull and the leaf node will be underfull after the just cell
1.5960 + ** copied to the internal node is deleted from it. This is a special
1.5961 + ** case because the call to balance() to correct the internal node
1.5962 + ** may change the tree structure and invalidate the contents of
1.5963 + ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
1.5964 + ** used by the balance() required to correct the underfull leaf
1.5965 + ** node.
1.5966 + **
1.5967 + ** The formula used in the expression above are based on facets of
1.5968 + ** the SQLite file-format that do not change over time.
1.5969 + */
1.5970 + testcase( pPage->nFree==pBt->usableSize*2/3+1 );
1.5971 + testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
1.5972 + leafCursorInvalid = 1;
1.5973 + }
1.5974 +
1.5975 + if( rc==SQLITE_OK ){
1.5976 + put4byte(findOverflowCell(pPage, idx), pgnoChild);
1.5977 + VVA_ONLY( pCur->pagesShuffled = 0 );
1.5978 + rc = balance(pCur, 0);
1.5979 + }
1.5980 +
1.5981 + if( rc==SQLITE_OK && leafCursorInvalid ){
1.5982 + /* The leaf-node is now underfull and so the tree needs to be
1.5983 + ** rebalanced. However, the balance() operation on the internal
1.5984 + ** node above may have modified the structure of the B-Tree and
1.5985 + ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
1.5986 + ** may not be trusted.
1.5987 + **
1.5988 + ** It is not possible to copy the ancestry from pCur, as the same
1.5989 + ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
1.5990 + ** arrays.
1.5991 + **
1.5992 + ** The call to saveCursorPosition() below internally saves the
1.5993 + ** key that leafCur is currently pointing to. Currently, there
1.5994 + ** are two copies of that key in the tree - one here on the leaf
1.5995 + ** page and one on some internal node in the tree. The copy on
1.5996 + ** the leaf node is always the next key in tree-order after the
1.5997 + ** copy on the internal node. So, the call to sqlite3BtreeNext()
1.5998 + ** calls restoreCursorPosition() to point the cursor to the copy
1.5999 + ** stored on the internal node, then advances to the next entry,
1.6000 + ** which happens to be the copy of the key on the internal node.
1.6001 + ** Net effect: leafCur is pointing back to the duplicate cell
1.6002 + ** that needs to be removed, and the leafCur.apPage[] and
1.6003 + ** leafCur.aiIdx[] arrays are correct.
1.6004 + */
1.6005 + VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
1.6006 + rc = saveCursorPosition(&leafCur);
1.6007 + if( rc==SQLITE_OK ){
1.6008 + rc = sqlite3BtreeNext(&leafCur, ¬Used);
1.6009 + }
1.6010 + pLeafPage = leafCur.apPage[leafCur.iPage];
1.6011 + assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno );
1.6012 + assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 );
1.6013 + }
1.6014 +
1.6015 + if( rc==SQLITE_OK ){
1.6016 + dropCell(pLeafPage, 0, szNext);
1.6017 + VVA_ONLY( leafCur.pagesShuffled = 0 );
1.6018 + rc = balance(&leafCur, 0);
1.6019 + assert( leafCursorInvalid || !leafCur.pagesShuffled
1.6020 + || !pCur->pagesShuffled );
1.6021 + }
1.6022 + }
1.6023 + sqlite3BtreeReleaseTempCursor(&leafCur);
1.6024 + }else{
1.6025 + TRACE(("DELETE: table=%d delete from leaf %d\n",
1.6026 + pCur->pgnoRoot, pPage->pgno));
1.6027 + dropCell(pPage, idx, cellSizePtr(pPage, pCell));
1.6028 + rc = balance(pCur, 0);
1.6029 + }
1.6030 + if( rc==SQLITE_OK ){
1.6031 + moveToRoot(pCur);
1.6032 + }
1.6033 + return rc;
1.6034 +}
1.6035 +
1.6036 +/*
1.6037 +** Create a new BTree table. Write into *piTable the page
1.6038 +** number for the root page of the new table.
1.6039 +**
1.6040 +** The type of type is determined by the flags parameter. Only the
1.6041 +** following values of flags are currently in use. Other values for
1.6042 +** flags might not work:
1.6043 +**
1.6044 +** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
1.6045 +** BTREE_ZERODATA Used for SQL indices
1.6046 +*/
1.6047 +static int btreeCreateTable(Btree *p, int *piTable, int flags){
1.6048 + BtShared *pBt = p->pBt;
1.6049 + MemPage *pRoot;
1.6050 + Pgno pgnoRoot;
1.6051 + int rc;
1.6052 +
1.6053 + assert( sqlite3BtreeHoldsMutex(p) );
1.6054 + if( pBt->inTransaction!=TRANS_WRITE ){
1.6055 + /* Must start a transaction first */
1.6056 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6057 + return rc;
1.6058 + }
1.6059 + assert( !pBt->readOnly );
1.6060 +
1.6061 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6062 + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
1.6063 + if( rc ){
1.6064 + return rc;
1.6065 + }
1.6066 +#else
1.6067 + if( pBt->autoVacuum ){
1.6068 + Pgno pgnoMove; /* Move a page here to make room for the root-page */
1.6069 + MemPage *pPageMove; /* The page to move to. */
1.6070 +
1.6071 + /* Creating a new table may probably require moving an existing database
1.6072 + ** to make room for the new tables root page. In case this page turns
1.6073 + ** out to be an overflow page, delete all overflow page-map caches
1.6074 + ** held by open cursors.
1.6075 + */
1.6076 + invalidateAllOverflowCache(pBt);
1.6077 +
1.6078 + /* Read the value of meta[3] from the database to determine where the
1.6079 + ** root page of the new table should go. meta[3] is the largest root-page
1.6080 + ** created so far, so the new root-page is (meta[3]+1).
1.6081 + */
1.6082 + rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
1.6083 + if( rc!=SQLITE_OK ){
1.6084 + return rc;
1.6085 + }
1.6086 + pgnoRoot++;
1.6087 +
1.6088 + /* The new root-page may not be allocated on a pointer-map page, or the
1.6089 + ** PENDING_BYTE page.
1.6090 + */
1.6091 + while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
1.6092 + pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
1.6093 + pgnoRoot++;
1.6094 + }
1.6095 + assert( pgnoRoot>=3 );
1.6096 +
1.6097 + /* Allocate a page. The page that currently resides at pgnoRoot will
1.6098 + ** be moved to the allocated page (unless the allocated page happens
1.6099 + ** to reside at pgnoRoot).
1.6100 + */
1.6101 + rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
1.6102 + if( rc!=SQLITE_OK ){
1.6103 + return rc;
1.6104 + }
1.6105 +
1.6106 + if( pgnoMove!=pgnoRoot ){
1.6107 + /* pgnoRoot is the page that will be used for the root-page of
1.6108 + ** the new table (assuming an error did not occur). But we were
1.6109 + ** allocated pgnoMove. If required (i.e. if it was not allocated
1.6110 + ** by extending the file), the current page at position pgnoMove
1.6111 + ** is already journaled.
1.6112 + */
1.6113 + u8 eType;
1.6114 + Pgno iPtrPage;
1.6115 +
1.6116 + releasePage(pPageMove);
1.6117 +
1.6118 + /* Move the page currently at pgnoRoot to pgnoMove. */
1.6119 + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
1.6120 + if( rc!=SQLITE_OK ){
1.6121 + return rc;
1.6122 + }
1.6123 + rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
1.6124 + if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
1.6125 + releasePage(pRoot);
1.6126 + return rc;
1.6127 + }
1.6128 + assert( eType!=PTRMAP_ROOTPAGE );
1.6129 + assert( eType!=PTRMAP_FREEPAGE );
1.6130 + rc = sqlite3PagerWrite(pRoot->pDbPage);
1.6131 + if( rc!=SQLITE_OK ){
1.6132 + releasePage(pRoot);
1.6133 + return rc;
1.6134 + }
1.6135 + rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
1.6136 + releasePage(pRoot);
1.6137 +
1.6138 + /* Obtain the page at pgnoRoot */
1.6139 + if( rc!=SQLITE_OK ){
1.6140 + return rc;
1.6141 + }
1.6142 + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
1.6143 + if( rc!=SQLITE_OK ){
1.6144 + return rc;
1.6145 + }
1.6146 + rc = sqlite3PagerWrite(pRoot->pDbPage);
1.6147 + if( rc!=SQLITE_OK ){
1.6148 + releasePage(pRoot);
1.6149 + return rc;
1.6150 + }
1.6151 + }else{
1.6152 + pRoot = pPageMove;
1.6153 + }
1.6154 +
1.6155 + /* Update the pointer-map and meta-data with the new root-page number. */
1.6156 + rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
1.6157 + if( rc ){
1.6158 + releasePage(pRoot);
1.6159 + return rc;
1.6160 + }
1.6161 + rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
1.6162 + if( rc ){
1.6163 + releasePage(pRoot);
1.6164 + return rc;
1.6165 + }
1.6166 +
1.6167 + }else{
1.6168 + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
1.6169 + if( rc ) return rc;
1.6170 + }
1.6171 +#endif
1.6172 + assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
1.6173 + zeroPage(pRoot, flags | PTF_LEAF);
1.6174 + sqlite3PagerUnref(pRoot->pDbPage);
1.6175 + *piTable = (int)pgnoRoot;
1.6176 + return SQLITE_OK;
1.6177 +}
1.6178 +int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
1.6179 + int rc;
1.6180 + sqlite3BtreeEnter(p);
1.6181 + p->pBt->db = p->db;
1.6182 + rc = btreeCreateTable(p, piTable, flags);
1.6183 + sqlite3BtreeLeave(p);
1.6184 + return rc;
1.6185 +}
1.6186 +
1.6187 +/*
1.6188 +** Erase the given database page and all its children. Return
1.6189 +** the page to the freelist.
1.6190 +*/
1.6191 +static int clearDatabasePage(
1.6192 + BtShared *pBt, /* The BTree that contains the table */
1.6193 + Pgno pgno, /* Page number to clear */
1.6194 + MemPage *pParent, /* Parent page. NULL for the root */
1.6195 + int freePageFlag /* Deallocate page if true */
1.6196 +){
1.6197 + MemPage *pPage = 0;
1.6198 + int rc;
1.6199 + unsigned char *pCell;
1.6200 + int i;
1.6201 +
1.6202 + assert( sqlite3_mutex_held(pBt->mutex) );
1.6203 + if( pgno>pagerPagecount(pBt->pPager) ){
1.6204 + return SQLITE_CORRUPT_BKPT;
1.6205 + }
1.6206 +
1.6207 + rc = getAndInitPage(pBt, pgno, &pPage);
1.6208 + if( rc ) goto cleardatabasepage_out;
1.6209 + for(i=0; i<pPage->nCell; i++){
1.6210 + pCell = findCell(pPage, i);
1.6211 + if( !pPage->leaf ){
1.6212 + rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
1.6213 + if( rc ) goto cleardatabasepage_out;
1.6214 + }
1.6215 + rc = clearCell(pPage, pCell);
1.6216 + if( rc ) goto cleardatabasepage_out;
1.6217 + }
1.6218 + if( !pPage->leaf ){
1.6219 + rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
1.6220 + if( rc ) goto cleardatabasepage_out;
1.6221 + }
1.6222 + if( freePageFlag ){
1.6223 + rc = freePage(pPage);
1.6224 + }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
1.6225 + zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
1.6226 + }
1.6227 +
1.6228 +cleardatabasepage_out:
1.6229 + releasePage(pPage);
1.6230 + return rc;
1.6231 +}
1.6232 +
1.6233 +/*
1.6234 +** Delete all information from a single table in the database. iTable is
1.6235 +** the page number of the root of the table. After this routine returns,
1.6236 +** the root page is empty, but still exists.
1.6237 +**
1.6238 +** This routine will fail with SQLITE_LOCKED if there are any open
1.6239 +** read cursors on the table. Open write cursors are moved to the
1.6240 +** root of the table.
1.6241 +*/
1.6242 +int sqlite3BtreeClearTable(Btree *p, int iTable){
1.6243 + int rc;
1.6244 + BtShared *pBt = p->pBt;
1.6245 + sqlite3BtreeEnter(p);
1.6246 + pBt->db = p->db;
1.6247 + if( p->inTrans!=TRANS_WRITE ){
1.6248 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6249 + }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
1.6250 + /* nothing to do */
1.6251 + }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
1.6252 + /* nothing to do */
1.6253 + }else{
1.6254 + rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
1.6255 + }
1.6256 + sqlite3BtreeLeave(p);
1.6257 + return rc;
1.6258 +}
1.6259 +
1.6260 +/*
1.6261 +** Erase all information in a table and add the root of the table to
1.6262 +** the freelist. Except, the root of the principle table (the one on
1.6263 +** page 1) is never added to the freelist.
1.6264 +**
1.6265 +** This routine will fail with SQLITE_LOCKED if there are any open
1.6266 +** cursors on the table.
1.6267 +**
1.6268 +** If AUTOVACUUM is enabled and the page at iTable is not the last
1.6269 +** root page in the database file, then the last root page
1.6270 +** in the database file is moved into the slot formerly occupied by
1.6271 +** iTable and that last slot formerly occupied by the last root page
1.6272 +** is added to the freelist instead of iTable. In this say, all
1.6273 +** root pages are kept at the beginning of the database file, which
1.6274 +** is necessary for AUTOVACUUM to work right. *piMoved is set to the
1.6275 +** page number that used to be the last root page in the file before
1.6276 +** the move. If no page gets moved, *piMoved is set to 0.
1.6277 +** The last root page is recorded in meta[3] and the value of
1.6278 +** meta[3] is updated by this procedure.
1.6279 +*/
1.6280 +static int btreeDropTable(Btree *p, int iTable, int *piMoved){
1.6281 + int rc;
1.6282 + MemPage *pPage = 0;
1.6283 + BtShared *pBt = p->pBt;
1.6284 +
1.6285 + assert( sqlite3BtreeHoldsMutex(p) );
1.6286 + if( p->inTrans!=TRANS_WRITE ){
1.6287 + return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6288 + }
1.6289 +
1.6290 + /* It is illegal to drop a table if any cursors are open on the
1.6291 + ** database. This is because in auto-vacuum mode the backend may
1.6292 + ** need to move another root-page to fill a gap left by the deleted
1.6293 + ** root page. If an open cursor was using this page a problem would
1.6294 + ** occur.
1.6295 + */
1.6296 + if( pBt->pCursor ){
1.6297 + return SQLITE_LOCKED;
1.6298 + }
1.6299 +
1.6300 + rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
1.6301 + if( rc ) return rc;
1.6302 + rc = sqlite3BtreeClearTable(p, iTable);
1.6303 + if( rc ){
1.6304 + releasePage(pPage);
1.6305 + return rc;
1.6306 + }
1.6307 +
1.6308 + *piMoved = 0;
1.6309 +
1.6310 + if( iTable>1 ){
1.6311 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6312 + rc = freePage(pPage);
1.6313 + releasePage(pPage);
1.6314 +#else
1.6315 + if( pBt->autoVacuum ){
1.6316 + Pgno maxRootPgno;
1.6317 + rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
1.6318 + if( rc!=SQLITE_OK ){
1.6319 + releasePage(pPage);
1.6320 + return rc;
1.6321 + }
1.6322 +
1.6323 + if( iTable==maxRootPgno ){
1.6324 + /* If the table being dropped is the table with the largest root-page
1.6325 + ** number in the database, put the root page on the free list.
1.6326 + */
1.6327 + rc = freePage(pPage);
1.6328 + releasePage(pPage);
1.6329 + if( rc!=SQLITE_OK ){
1.6330 + return rc;
1.6331 + }
1.6332 + }else{
1.6333 + /* The table being dropped does not have the largest root-page
1.6334 + ** number in the database. So move the page that does into the
1.6335 + ** gap left by the deleted root-page.
1.6336 + */
1.6337 + MemPage *pMove;
1.6338 + releasePage(pPage);
1.6339 + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
1.6340 + if( rc!=SQLITE_OK ){
1.6341 + return rc;
1.6342 + }
1.6343 + rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
1.6344 + releasePage(pMove);
1.6345 + if( rc!=SQLITE_OK ){
1.6346 + return rc;
1.6347 + }
1.6348 + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
1.6349 + if( rc!=SQLITE_OK ){
1.6350 + return rc;
1.6351 + }
1.6352 + rc = freePage(pMove);
1.6353 + releasePage(pMove);
1.6354 + if( rc!=SQLITE_OK ){
1.6355 + return rc;
1.6356 + }
1.6357 + *piMoved = maxRootPgno;
1.6358 + }
1.6359 +
1.6360 + /* Set the new 'max-root-page' value in the database header. This
1.6361 + ** is the old value less one, less one more if that happens to
1.6362 + ** be a root-page number, less one again if that is the
1.6363 + ** PENDING_BYTE_PAGE.
1.6364 + */
1.6365 + maxRootPgno--;
1.6366 + if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
1.6367 + maxRootPgno--;
1.6368 + }
1.6369 + if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
1.6370 + maxRootPgno--;
1.6371 + }
1.6372 + assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
1.6373 +
1.6374 + rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
1.6375 + }else{
1.6376 + rc = freePage(pPage);
1.6377 + releasePage(pPage);
1.6378 + }
1.6379 +#endif
1.6380 + }else{
1.6381 + /* If sqlite3BtreeDropTable was called on page 1. */
1.6382 + zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
1.6383 + releasePage(pPage);
1.6384 + }
1.6385 + return rc;
1.6386 +}
1.6387 +int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
1.6388 + int rc;
1.6389 + sqlite3BtreeEnter(p);
1.6390 + p->pBt->db = p->db;
1.6391 + rc = btreeDropTable(p, iTable, piMoved);
1.6392 + sqlite3BtreeLeave(p);
1.6393 + return rc;
1.6394 +}
1.6395 +
1.6396 +
1.6397 +/*
1.6398 +** Read the meta-information out of a database file. Meta[0]
1.6399 +** is the number of free pages currently in the database. Meta[1]
1.6400 +** through meta[15] are available for use by higher layers. Meta[0]
1.6401 +** is read-only, the others are read/write.
1.6402 +**
1.6403 +** The schema layer numbers meta values differently. At the schema
1.6404 +** layer (and the SetCookie and ReadCookie opcodes) the number of
1.6405 +** free pages is not visible. So Cookie[0] is the same as Meta[1].
1.6406 +*/
1.6407 +int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
1.6408 + DbPage *pDbPage;
1.6409 + int rc;
1.6410 + unsigned char *pP1;
1.6411 + BtShared *pBt = p->pBt;
1.6412 +
1.6413 + sqlite3BtreeEnter(p);
1.6414 + pBt->db = p->db;
1.6415 +
1.6416 + /* Reading a meta-data value requires a read-lock on page 1 (and hence
1.6417 + ** the sqlite_master table. We grab this lock regardless of whether or
1.6418 + ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
1.6419 + ** 1 is treated as a special case by queryTableLock() and lockTable()).
1.6420 + */
1.6421 + rc = queryTableLock(p, 1, READ_LOCK);
1.6422 + if( rc!=SQLITE_OK ){
1.6423 + sqlite3BtreeLeave(p);
1.6424 + return rc;
1.6425 + }
1.6426 +
1.6427 + assert( idx>=0 && idx<=15 );
1.6428 + if( pBt->pPage1 ){
1.6429 + /* The b-tree is already holding a reference to page 1 of the database
1.6430 + ** file. In this case the required meta-data value can be read directly
1.6431 + ** from the page data of this reference. This is slightly faster than
1.6432 + ** requesting a new reference from the pager layer.
1.6433 + */
1.6434 + pP1 = (unsigned char *)pBt->pPage1->aData;
1.6435 + }else{
1.6436 + /* The b-tree does not have a reference to page 1 of the database file.
1.6437 + ** Obtain one from the pager layer.
1.6438 + */
1.6439 + rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
1.6440 + if( rc ){
1.6441 + sqlite3BtreeLeave(p);
1.6442 + return rc;
1.6443 + }
1.6444 + pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
1.6445 + }
1.6446 + *pMeta = get4byte(&pP1[36 + idx*4]);
1.6447 +
1.6448 + /* If the b-tree is not holding a reference to page 1, then one was
1.6449 + ** requested from the pager layer in the above block. Release it now.
1.6450 + */
1.6451 + if( !pBt->pPage1 ){
1.6452 + sqlite3PagerUnref(pDbPage);
1.6453 + }
1.6454 +
1.6455 + /* If autovacuumed is disabled in this build but we are trying to
1.6456 + ** access an autovacuumed database, then make the database readonly.
1.6457 + */
1.6458 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6459 + if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
1.6460 +#endif
1.6461 +
1.6462 + /* Grab the read-lock on page 1. */
1.6463 + rc = lockTable(p, 1, READ_LOCK);
1.6464 + sqlite3BtreeLeave(p);
1.6465 + return rc;
1.6466 +}
1.6467 +
1.6468 +/*
1.6469 +** Write meta-information back into the database. Meta[0] is
1.6470 +** read-only and may not be written.
1.6471 +*/
1.6472 +int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
1.6473 + BtShared *pBt = p->pBt;
1.6474 + unsigned char *pP1;
1.6475 + int rc;
1.6476 + assert( idx>=1 && idx<=15 );
1.6477 + sqlite3BtreeEnter(p);
1.6478 + pBt->db = p->db;
1.6479 + if( p->inTrans!=TRANS_WRITE ){
1.6480 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6481 + }else{
1.6482 + assert( pBt->pPage1!=0 );
1.6483 + pP1 = pBt->pPage1->aData;
1.6484 + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
1.6485 + if( rc==SQLITE_OK ){
1.6486 + put4byte(&pP1[36 + idx*4], iMeta);
1.6487 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6488 + if( idx==7 ){
1.6489 + assert( pBt->autoVacuum || iMeta==0 );
1.6490 + assert( iMeta==0 || iMeta==1 );
1.6491 + pBt->incrVacuum = iMeta;
1.6492 + }
1.6493 +#endif
1.6494 + }
1.6495 + }
1.6496 + sqlite3BtreeLeave(p);
1.6497 + return rc;
1.6498 +}
1.6499 +
1.6500 +/*
1.6501 +** Return the flag byte at the beginning of the page that the cursor
1.6502 +** is currently pointing to.
1.6503 +*/
1.6504 +int sqlite3BtreeFlags(BtCursor *pCur){
1.6505 + /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
1.6506 + ** restoreCursorPosition() here.
1.6507 + */
1.6508 + MemPage *pPage;
1.6509 + restoreCursorPosition(pCur);
1.6510 + pPage = pCur->apPage[pCur->iPage];
1.6511 + assert( cursorHoldsMutex(pCur) );
1.6512 + assert( pPage->pBt==pCur->pBt );
1.6513 + return pPage ? pPage->aData[pPage->hdrOffset] : 0;
1.6514 +}
1.6515 +
1.6516 +
1.6517 +/*
1.6518 +** Return the pager associated with a BTree. This routine is used for
1.6519 +** testing and debugging only.
1.6520 +*/
1.6521 +Pager *sqlite3BtreePager(Btree *p){
1.6522 + return p->pBt->pPager;
1.6523 +}
1.6524 +
1.6525 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6526 +/*
1.6527 +** Append a message to the error message string.
1.6528 +*/
1.6529 +static void checkAppendMsg(
1.6530 + IntegrityCk *pCheck,
1.6531 + char *zMsg1,
1.6532 + const char *zFormat,
1.6533 + ...
1.6534 +){
1.6535 + va_list ap;
1.6536 + if( !pCheck->mxErr ) return;
1.6537 + pCheck->mxErr--;
1.6538 + pCheck->nErr++;
1.6539 + va_start(ap, zFormat);
1.6540 + if( pCheck->errMsg.nChar ){
1.6541 + sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
1.6542 + }
1.6543 + if( zMsg1 ){
1.6544 + sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
1.6545 + }
1.6546 + sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
1.6547 + va_end(ap);
1.6548 + if( pCheck->errMsg.mallocFailed ){
1.6549 + pCheck->mallocFailed = 1;
1.6550 + }
1.6551 +}
1.6552 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6553 +
1.6554 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6555 +/*
1.6556 +** Add 1 to the reference count for page iPage. If this is the second
1.6557 +** reference to the page, add an error message to pCheck->zErrMsg.
1.6558 +** Return 1 if there are 2 ore more references to the page and 0 if
1.6559 +** if this is the first reference to the page.
1.6560 +**
1.6561 +** Also check that the page number is in bounds.
1.6562 +*/
1.6563 +static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
1.6564 + if( iPage==0 ) return 1;
1.6565 + if( iPage>pCheck->nPage || iPage<0 ){
1.6566 + checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
1.6567 + return 1;
1.6568 + }
1.6569 + if( pCheck->anRef[iPage]==1 ){
1.6570 + checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
1.6571 + return 1;
1.6572 + }
1.6573 + return (pCheck->anRef[iPage]++)>1;
1.6574 +}
1.6575 +
1.6576 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6577 +/*
1.6578 +** Check that the entry in the pointer-map for page iChild maps to
1.6579 +** page iParent, pointer type ptrType. If not, append an error message
1.6580 +** to pCheck.
1.6581 +*/
1.6582 +static void checkPtrmap(
1.6583 + IntegrityCk *pCheck, /* Integrity check context */
1.6584 + Pgno iChild, /* Child page number */
1.6585 + u8 eType, /* Expected pointer map type */
1.6586 + Pgno iParent, /* Expected pointer map parent page number */
1.6587 + char *zContext /* Context description (used for error msg) */
1.6588 +){
1.6589 + int rc;
1.6590 + u8 ePtrmapType;
1.6591 + Pgno iPtrmapParent;
1.6592 +
1.6593 + rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
1.6594 + if( rc!=SQLITE_OK ){
1.6595 + checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
1.6596 + return;
1.6597 + }
1.6598 +
1.6599 + if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
1.6600 + checkAppendMsg(pCheck, zContext,
1.6601 + "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
1.6602 + iChild, eType, iParent, ePtrmapType, iPtrmapParent);
1.6603 + }
1.6604 +}
1.6605 +#endif
1.6606 +
1.6607 +/*
1.6608 +** Check the integrity of the freelist or of an overflow page list.
1.6609 +** Verify that the number of pages on the list is N.
1.6610 +*/
1.6611 +static void checkList(
1.6612 + IntegrityCk *pCheck, /* Integrity checking context */
1.6613 + int isFreeList, /* True for a freelist. False for overflow page list */
1.6614 + int iPage, /* Page number for first page in the list */
1.6615 + int N, /* Expected number of pages in the list */
1.6616 + char *zContext /* Context for error messages */
1.6617 +){
1.6618 + int i;
1.6619 + int expected = N;
1.6620 + int iFirst = iPage;
1.6621 + while( N-- > 0 && pCheck->mxErr ){
1.6622 + DbPage *pOvflPage;
1.6623 + unsigned char *pOvflData;
1.6624 + if( iPage<1 ){
1.6625 + checkAppendMsg(pCheck, zContext,
1.6626 + "%d of %d pages missing from overflow list starting at %d",
1.6627 + N+1, expected, iFirst);
1.6628 + break;
1.6629 + }
1.6630 + if( checkRef(pCheck, iPage, zContext) ) break;
1.6631 + if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
1.6632 + checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
1.6633 + break;
1.6634 + }
1.6635 + pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
1.6636 + if( isFreeList ){
1.6637 + int n = get4byte(&pOvflData[4]);
1.6638 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6639 + if( pCheck->pBt->autoVacuum ){
1.6640 + checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
1.6641 + }
1.6642 +#endif
1.6643 + if( n>pCheck->pBt->usableSize/4-2 ){
1.6644 + checkAppendMsg(pCheck, zContext,
1.6645 + "freelist leaf count too big on page %d", iPage);
1.6646 + N--;
1.6647 + }else{
1.6648 + for(i=0; i<n; i++){
1.6649 + Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
1.6650 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6651 + if( pCheck->pBt->autoVacuum ){
1.6652 + checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
1.6653 + }
1.6654 +#endif
1.6655 + checkRef(pCheck, iFreePage, zContext);
1.6656 + }
1.6657 + N -= n;
1.6658 + }
1.6659 + }
1.6660 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6661 + else{
1.6662 + /* If this database supports auto-vacuum and iPage is not the last
1.6663 + ** page in this overflow list, check that the pointer-map entry for
1.6664 + ** the following page matches iPage.
1.6665 + */
1.6666 + if( pCheck->pBt->autoVacuum && N>0 ){
1.6667 + i = get4byte(pOvflData);
1.6668 + checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
1.6669 + }
1.6670 + }
1.6671 +#endif
1.6672 + iPage = get4byte(pOvflData);
1.6673 + sqlite3PagerUnref(pOvflPage);
1.6674 + }
1.6675 +}
1.6676 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6677 +
1.6678 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6679 +/*
1.6680 +** Do various sanity checks on a single page of a tree. Return
1.6681 +** the tree depth. Root pages return 0. Parents of root pages
1.6682 +** return 1, and so forth.
1.6683 +**
1.6684 +** These checks are done:
1.6685 +**
1.6686 +** 1. Make sure that cells and freeblocks do not overlap
1.6687 +** but combine to completely cover the page.
1.6688 +** NO 2. Make sure cell keys are in order.
1.6689 +** NO 3. Make sure no key is less than or equal to zLowerBound.
1.6690 +** NO 4. Make sure no key is greater than or equal to zUpperBound.
1.6691 +** 5. Check the integrity of overflow pages.
1.6692 +** 6. Recursively call checkTreePage on all children.
1.6693 +** 7. Verify that the depth of all children is the same.
1.6694 +** 8. Make sure this page is at least 33% full or else it is
1.6695 +** the root of the tree.
1.6696 +*/
1.6697 +static int checkTreePage(
1.6698 + IntegrityCk *pCheck, /* Context for the sanity check */
1.6699 + int iPage, /* Page number of the page to check */
1.6700 + MemPage *pParent, /* Parent page */
1.6701 + char *zParentContext /* Parent context */
1.6702 +){
1.6703 + MemPage *pPage;
1.6704 + int i, rc, depth, d2, pgno, cnt;
1.6705 + int hdr, cellStart;
1.6706 + int nCell;
1.6707 + u8 *data;
1.6708 + BtShared *pBt;
1.6709 + int usableSize;
1.6710 + char zContext[100];
1.6711 + char *hit;
1.6712 +
1.6713 + sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
1.6714 +
1.6715 + /* Check that the page exists
1.6716 + */
1.6717 + pBt = pCheck->pBt;
1.6718 + usableSize = pBt->usableSize;
1.6719 + if( iPage==0 ) return 0;
1.6720 + if( checkRef(pCheck, iPage, zParentContext) ) return 0;
1.6721 + if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
1.6722 + checkAppendMsg(pCheck, zContext,
1.6723 + "unable to get the page. error code=%d", rc);
1.6724 + return 0;
1.6725 + }
1.6726 + if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
1.6727 + checkAppendMsg(pCheck, zContext,
1.6728 + "sqlite3BtreeInitPage() returns error code %d", rc);
1.6729 + releasePage(pPage);
1.6730 + return 0;
1.6731 + }
1.6732 +
1.6733 + /* Check out all the cells.
1.6734 + */
1.6735 + depth = 0;
1.6736 + for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
1.6737 + u8 *pCell;
1.6738 + int sz;
1.6739 + CellInfo info;
1.6740 +
1.6741 + /* Check payload overflow pages
1.6742 + */
1.6743 + sqlite3_snprintf(sizeof(zContext), zContext,
1.6744 + "On tree page %d cell %d: ", iPage, i);
1.6745 + pCell = findCell(pPage,i);
1.6746 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.6747 + sz = info.nData;
1.6748 + if( !pPage->intKey ) sz += info.nKey;
1.6749 + assert( sz==info.nPayload );
1.6750 + if( sz>info.nLocal ){
1.6751 + int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
1.6752 + Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
1.6753 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6754 + if( pBt->autoVacuum ){
1.6755 + checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
1.6756 + }
1.6757 +#endif
1.6758 + checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
1.6759 + }
1.6760 +
1.6761 + /* Check sanity of left child page.
1.6762 + */
1.6763 + if( !pPage->leaf ){
1.6764 + pgno = get4byte(pCell);
1.6765 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6766 + if( pBt->autoVacuum ){
1.6767 + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
1.6768 + }
1.6769 +#endif
1.6770 + d2 = checkTreePage(pCheck,pgno,pPage,zContext);
1.6771 + if( i>0 && d2!=depth ){
1.6772 + checkAppendMsg(pCheck, zContext, "Child page depth differs");
1.6773 + }
1.6774 + depth = d2;
1.6775 + }
1.6776 + }
1.6777 + if( !pPage->leaf ){
1.6778 + pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.6779 + sqlite3_snprintf(sizeof(zContext), zContext,
1.6780 + "On page %d at right child: ", iPage);
1.6781 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6782 + if( pBt->autoVacuum ){
1.6783 + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
1.6784 + }
1.6785 +#endif
1.6786 + checkTreePage(pCheck, pgno, pPage, zContext);
1.6787 + }
1.6788 +
1.6789 + /* Check for complete coverage of the page
1.6790 + */
1.6791 + data = pPage->aData;
1.6792 + hdr = pPage->hdrOffset;
1.6793 + hit = sqlite3PageMalloc( pBt->pageSize );
1.6794 + if( hit==0 ){
1.6795 + pCheck->mallocFailed = 1;
1.6796 + }else{
1.6797 + memset(hit, 0, usableSize );
1.6798 + memset(hit, 1, get2byte(&data[hdr+5]));
1.6799 + nCell = get2byte(&data[hdr+3]);
1.6800 + cellStart = hdr + 12 - 4*pPage->leaf;
1.6801 + for(i=0; i<nCell; i++){
1.6802 + int pc = get2byte(&data[cellStart+i*2]);
1.6803 + u16 size = 1024;
1.6804 + int j;
1.6805 + if( pc<=usableSize ){
1.6806 + size = cellSizePtr(pPage, &data[pc]);
1.6807 + }
1.6808 + if( (pc+size-1)>=usableSize || pc<0 ){
1.6809 + checkAppendMsg(pCheck, 0,
1.6810 + "Corruption detected in cell %d on page %d",i,iPage,0);
1.6811 + }else{
1.6812 + for(j=pc+size-1; j>=pc; j--) hit[j]++;
1.6813 + }
1.6814 + }
1.6815 + for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
1.6816 + cnt++){
1.6817 + int size = get2byte(&data[i+2]);
1.6818 + int j;
1.6819 + if( (i+size-1)>=usableSize || i<0 ){
1.6820 + checkAppendMsg(pCheck, 0,
1.6821 + "Corruption detected in cell %d on page %d",i,iPage,0);
1.6822 + }else{
1.6823 + for(j=i+size-1; j>=i; j--) hit[j]++;
1.6824 + }
1.6825 + i = get2byte(&data[i]);
1.6826 + }
1.6827 + for(i=cnt=0; i<usableSize; i++){
1.6828 + if( hit[i]==0 ){
1.6829 + cnt++;
1.6830 + }else if( hit[i]>1 ){
1.6831 + checkAppendMsg(pCheck, 0,
1.6832 + "Multiple uses for byte %d of page %d", i, iPage);
1.6833 + break;
1.6834 + }
1.6835 + }
1.6836 + if( cnt!=data[hdr+7] ){
1.6837 + checkAppendMsg(pCheck, 0,
1.6838 + "Fragmented space is %d byte reported as %d on page %d",
1.6839 + cnt, data[hdr+7], iPage);
1.6840 + }
1.6841 + }
1.6842 + sqlite3PageFree(hit);
1.6843 +
1.6844 + releasePage(pPage);
1.6845 + return depth+1;
1.6846 +}
1.6847 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6848 +
1.6849 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6850 +/*
1.6851 +** This routine does a complete check of the given BTree file. aRoot[] is
1.6852 +** an array of pages numbers were each page number is the root page of
1.6853 +** a table. nRoot is the number of entries in aRoot.
1.6854 +**
1.6855 +** Write the number of error seen in *pnErr. Except for some memory
1.6856 +** allocation errors, nn error message is held in memory obtained from
1.6857 +** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
1.6858 +** returned.
1.6859 +*/
1.6860 +char *sqlite3BtreeIntegrityCheck(
1.6861 + Btree *p, /* The btree to be checked */
1.6862 + int *aRoot, /* An array of root pages numbers for individual trees */
1.6863 + int nRoot, /* Number of entries in aRoot[] */
1.6864 + int mxErr, /* Stop reporting errors after this many */
1.6865 + int *pnErr /* Write number of errors seen to this variable */
1.6866 +){
1.6867 + int i;
1.6868 + int nRef;
1.6869 + IntegrityCk sCheck;
1.6870 + BtShared *pBt = p->pBt;
1.6871 + char zErr[100];
1.6872 +
1.6873 + sqlite3BtreeEnter(p);
1.6874 + pBt->db = p->db;
1.6875 + nRef = sqlite3PagerRefcount(pBt->pPager);
1.6876 + if( lockBtreeWithRetry(p)!=SQLITE_OK ){
1.6877 + *pnErr = 1;
1.6878 + sqlite3BtreeLeave(p);
1.6879 + return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
1.6880 + }
1.6881 + sCheck.pBt = pBt;
1.6882 + sCheck.pPager = pBt->pPager;
1.6883 + sCheck.nPage = pagerPagecount(sCheck.pPager);
1.6884 + sCheck.mxErr = mxErr;
1.6885 + sCheck.nErr = 0;
1.6886 + sCheck.mallocFailed = 0;
1.6887 + *pnErr = 0;
1.6888 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6889 + if( pBt->nTrunc!=0 ){
1.6890 + sCheck.nPage = pBt->nTrunc;
1.6891 + }
1.6892 +#endif
1.6893 + if( sCheck.nPage==0 ){
1.6894 + unlockBtreeIfUnused(pBt);
1.6895 + sqlite3BtreeLeave(p);
1.6896 + return 0;
1.6897 + }
1.6898 + sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
1.6899 + if( !sCheck.anRef ){
1.6900 + unlockBtreeIfUnused(pBt);
1.6901 + *pnErr = 1;
1.6902 + sqlite3BtreeLeave(p);
1.6903 + return 0;
1.6904 + }
1.6905 + for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
1.6906 + i = PENDING_BYTE_PAGE(pBt);
1.6907 + if( i<=sCheck.nPage ){
1.6908 + sCheck.anRef[i] = 1;
1.6909 + }
1.6910 + sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
1.6911 +
1.6912 + /* Check the integrity of the freelist
1.6913 + */
1.6914 + checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
1.6915 + get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
1.6916 +
1.6917 + /* Check all the tables.
1.6918 + */
1.6919 + for(i=0; i<nRoot && sCheck.mxErr; i++){
1.6920 + if( aRoot[i]==0 ) continue;
1.6921 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6922 + if( pBt->autoVacuum && aRoot[i]>1 ){
1.6923 + checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
1.6924 + }
1.6925 +#endif
1.6926 + checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
1.6927 + }
1.6928 +
1.6929 + /* Make sure every page in the file is referenced
1.6930 + */
1.6931 + for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
1.6932 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6933 + if( sCheck.anRef[i]==0 ){
1.6934 + checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
1.6935 + }
1.6936 +#else
1.6937 + /* If the database supports auto-vacuum, make sure no tables contain
1.6938 + ** references to pointer-map pages.
1.6939 + */
1.6940 + if( sCheck.anRef[i]==0 &&
1.6941 + (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
1.6942 + checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
1.6943 + }
1.6944 + if( sCheck.anRef[i]!=0 &&
1.6945 + (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
1.6946 + checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
1.6947 + }
1.6948 +#endif
1.6949 + }
1.6950 +
1.6951 + /* Make sure this analysis did not leave any unref() pages
1.6952 + */
1.6953 + unlockBtreeIfUnused(pBt);
1.6954 + if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
1.6955 + checkAppendMsg(&sCheck, 0,
1.6956 + "Outstanding page count goes from %d to %d during this analysis",
1.6957 + nRef, sqlite3PagerRefcount(pBt->pPager)
1.6958 + );
1.6959 + }
1.6960 +
1.6961 + /* Clean up and report errors.
1.6962 + */
1.6963 + sqlite3BtreeLeave(p);
1.6964 + sqlite3_free(sCheck.anRef);
1.6965 + if( sCheck.mallocFailed ){
1.6966 + sqlite3StrAccumReset(&sCheck.errMsg);
1.6967 + *pnErr = sCheck.nErr+1;
1.6968 + return 0;
1.6969 + }
1.6970 + *pnErr = sCheck.nErr;
1.6971 + if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
1.6972 + return sqlite3StrAccumFinish(&sCheck.errMsg);
1.6973 +}
1.6974 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6975 +
1.6976 +/*
1.6977 +** Return the full pathname of the underlying database file.
1.6978 +**
1.6979 +** The pager filename is invariant as long as the pager is
1.6980 +** open so it is safe to access without the BtShared mutex.
1.6981 +*/
1.6982 +const char *sqlite3BtreeGetFilename(Btree *p){
1.6983 + assert( p->pBt->pPager!=0 );
1.6984 + return sqlite3PagerFilename(p->pBt->pPager);
1.6985 +}
1.6986 +
1.6987 +/*
1.6988 +** Return the pathname of the directory that contains the database file.
1.6989 +**
1.6990 +** The pager directory name is invariant as long as the pager is
1.6991 +** open so it is safe to access without the BtShared mutex.
1.6992 +*/
1.6993 +const char *sqlite3BtreeGetDirname(Btree *p){
1.6994 + assert( p->pBt->pPager!=0 );
1.6995 + return sqlite3PagerDirname(p->pBt->pPager);
1.6996 +}
1.6997 +
1.6998 +/*
1.6999 +** Return the pathname of the journal file for this database. The return
1.7000 +** value of this routine is the same regardless of whether the journal file
1.7001 +** has been created or not.
1.7002 +**
1.7003 +** The pager journal filename is invariant as long as the pager is
1.7004 +** open so it is safe to access without the BtShared mutex.
1.7005 +*/
1.7006 +const char *sqlite3BtreeGetJournalname(Btree *p){
1.7007 + assert( p->pBt->pPager!=0 );
1.7008 + return sqlite3PagerJournalname(p->pBt->pPager);
1.7009 +}
1.7010 +
1.7011 +#ifndef SQLITE_OMIT_VACUUM
1.7012 +/*
1.7013 +** Copy the complete content of pBtFrom into pBtTo. A transaction
1.7014 +** must be active for both files.
1.7015 +**
1.7016 +** The size of file pTo may be reduced by this operation.
1.7017 +** If anything goes wrong, the transaction on pTo is rolled back.
1.7018 +**
1.7019 +** If successful, CommitPhaseOne() may be called on pTo before returning.
1.7020 +** The caller should finish committing the transaction on pTo by calling
1.7021 +** sqlite3BtreeCommit().
1.7022 +*/
1.7023 +static int btreeCopyFile(Btree *pTo, Btree *pFrom){
1.7024 + int rc = SQLITE_OK;
1.7025 + Pgno i;
1.7026 +
1.7027 + Pgno nFromPage; /* Number of pages in pFrom */
1.7028 + Pgno nToPage; /* Number of pages in pTo */
1.7029 + Pgno nNewPage; /* Number of pages in pTo after the copy */
1.7030 +
1.7031 + Pgno iSkip; /* Pending byte page in pTo */
1.7032 + int nToPageSize; /* Page size of pTo in bytes */
1.7033 + int nFromPageSize; /* Page size of pFrom in bytes */
1.7034 +
1.7035 + BtShared *pBtTo = pTo->pBt;
1.7036 + BtShared *pBtFrom = pFrom->pBt;
1.7037 + pBtTo->db = pTo->db;
1.7038 + pBtFrom->db = pFrom->db;
1.7039 +
1.7040 + nToPageSize = pBtTo->pageSize;
1.7041 + nFromPageSize = pBtFrom->pageSize;
1.7042 +
1.7043 + if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
1.7044 + return SQLITE_ERROR;
1.7045 + }
1.7046 + if( pBtTo->pCursor ){
1.7047 + return SQLITE_BUSY;
1.7048 + }
1.7049 +
1.7050 + nToPage = pagerPagecount(pBtTo->pPager);
1.7051 + nFromPage = pagerPagecount(pBtFrom->pPager);
1.7052 + iSkip = PENDING_BYTE_PAGE(pBtTo);
1.7053 +
1.7054 + /* Variable nNewPage is the number of pages required to store the
1.7055 + ** contents of pFrom using the current page-size of pTo.
1.7056 + */
1.7057 + nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
1.7058 + (i64)nToPageSize;
1.7059 +
1.7060 + for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
1.7061 +
1.7062 + /* Journal the original page.
1.7063 + **
1.7064 + ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
1.7065 + ** in database *pTo (before the copy). This page is never written
1.7066 + ** into the journal file. Unless i==iSkip or the page was not
1.7067 + ** present in pTo before the copy operation, journal page i from pTo.
1.7068 + */
1.7069 + if( i!=iSkip && i<=nToPage ){
1.7070 + DbPage *pDbPage = 0;
1.7071 + rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
1.7072 + if( rc==SQLITE_OK ){
1.7073 + rc = sqlite3PagerWrite(pDbPage);
1.7074 + if( rc==SQLITE_OK && i>nFromPage ){
1.7075 + /* Yeah. It seems wierd to call DontWrite() right after Write(). But
1.7076 + ** that is because the names of those procedures do not exactly
1.7077 + ** represent what they do. Write() really means "put this page in the
1.7078 + ** rollback journal and mark it as dirty so that it will be written
1.7079 + ** to the database file later." DontWrite() undoes the second part of
1.7080 + ** that and prevents the page from being written to the database. The
1.7081 + ** page is still on the rollback journal, though. And that is the
1.7082 + ** whole point of this block: to put pages on the rollback journal.
1.7083 + */
1.7084 + rc = sqlite3PagerDontWrite(pDbPage);
1.7085 + }
1.7086 + sqlite3PagerUnref(pDbPage);
1.7087 + }
1.7088 + }
1.7089 +
1.7090 + /* Overwrite the data in page i of the target database */
1.7091 + if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
1.7092 +
1.7093 + DbPage *pToPage = 0;
1.7094 + sqlite3_int64 iOff;
1.7095 +
1.7096 + rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
1.7097 + if( rc==SQLITE_OK ){
1.7098 + rc = sqlite3PagerWrite(pToPage);
1.7099 + }
1.7100 +
1.7101 + for(
1.7102 + iOff=(i-1)*nToPageSize;
1.7103 + rc==SQLITE_OK && iOff<i*nToPageSize;
1.7104 + iOff += nFromPageSize
1.7105 + ){
1.7106 + DbPage *pFromPage = 0;
1.7107 + Pgno iFrom = (iOff/nFromPageSize)+1;
1.7108 +
1.7109 + if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
1.7110 + continue;
1.7111 + }
1.7112 +
1.7113 + rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
1.7114 + if( rc==SQLITE_OK ){
1.7115 + char *zTo = sqlite3PagerGetData(pToPage);
1.7116 + char *zFrom = sqlite3PagerGetData(pFromPage);
1.7117 + int nCopy;
1.7118 +
1.7119 + if( nFromPageSize>=nToPageSize ){
1.7120 + zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
1.7121 + nCopy = nToPageSize;
1.7122 + }else{
1.7123 + zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
1.7124 + nCopy = nFromPageSize;
1.7125 + }
1.7126 +
1.7127 + memcpy(zTo, zFrom, nCopy);
1.7128 + sqlite3PagerUnref(pFromPage);
1.7129 + }
1.7130 + }
1.7131 +
1.7132 + if( pToPage ){
1.7133 + MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
1.7134 + p->isInit = 0;
1.7135 + sqlite3PagerUnref(pToPage);
1.7136 + }
1.7137 + }
1.7138 + }
1.7139 +
1.7140 + /* If things have worked so far, the database file may need to be
1.7141 + ** truncated. The complex part is that it may need to be truncated to
1.7142 + ** a size that is not an integer multiple of nToPageSize - the current
1.7143 + ** page size used by the pager associated with B-Tree pTo.
1.7144 + **
1.7145 + ** For example, say the page-size of pTo is 2048 bytes and the original
1.7146 + ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
1.7147 + ** bytes and 9 pages, then the file needs to be truncated to 9KB.
1.7148 + */
1.7149 + if( rc==SQLITE_OK ){
1.7150 + if( nFromPageSize!=nToPageSize ){
1.7151 + sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
1.7152 + i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
1.7153 + i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
1.7154 + i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
1.7155 +
1.7156 + assert( iSize<=iNow );
1.7157 +
1.7158 + /* Commit phase one syncs the journal file associated with pTo
1.7159 + ** containing the original data. It does not sync the database file
1.7160 + ** itself. After doing this it is safe to use OsTruncate() and other
1.7161 + ** file APIs on the database file directly.
1.7162 + */
1.7163 + pBtTo->db = pTo->db;
1.7164 + rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
1.7165 + if( iSize<iNow && rc==SQLITE_OK ){
1.7166 + rc = sqlite3OsTruncate(pFile, iSize);
1.7167 + }
1.7168 +
1.7169 + /* The loop that copied data from database pFrom to pTo did not
1.7170 + ** populate the locking page of database pTo. If the page-size of
1.7171 + ** pFrom is smaller than that of pTo, this means some data will
1.7172 + ** not have been copied.
1.7173 + **
1.7174 + ** This block copies the missing data from database pFrom to pTo
1.7175 + ** using file APIs. This is safe because at this point we know that
1.7176 + ** all of the original data from pTo has been synced into the
1.7177 + ** journal file. At this point it would be safe to do anything at
1.7178 + ** all to the database file except truncate it to zero bytes.
1.7179 + */
1.7180 + if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
1.7181 + i64 iOff;
1.7182 + for(
1.7183 + iOff=iPending;
1.7184 + rc==SQLITE_OK && iOff<(iPending+nToPageSize);
1.7185 + iOff += nFromPageSize
1.7186 + ){
1.7187 + DbPage *pFromPage = 0;
1.7188 + Pgno iFrom = (iOff/nFromPageSize)+1;
1.7189 +
1.7190 + if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
1.7191 + continue;
1.7192 + }
1.7193 +
1.7194 + rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
1.7195 + if( rc==SQLITE_OK ){
1.7196 + char *zFrom = sqlite3PagerGetData(pFromPage);
1.7197 + rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
1.7198 + sqlite3PagerUnref(pFromPage);
1.7199 + }
1.7200 + }
1.7201 + }
1.7202 +
1.7203 + /* Sync the database file */
1.7204 + if( rc==SQLITE_OK ){
1.7205 + rc = sqlite3PagerSync(pBtTo->pPager);
1.7206 + }
1.7207 + }else{
1.7208 + rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
1.7209 + }
1.7210 + if( rc==SQLITE_OK ){
1.7211 + pBtTo->pageSizeFixed = 0;
1.7212 + }
1.7213 + }
1.7214 +
1.7215 + if( rc ){
1.7216 + sqlite3BtreeRollback(pTo);
1.7217 + }
1.7218 +
1.7219 + return rc;
1.7220 +}
1.7221 +int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
1.7222 + int rc;
1.7223 + sqlite3BtreeEnter(pTo);
1.7224 + sqlite3BtreeEnter(pFrom);
1.7225 + rc = btreeCopyFile(pTo, pFrom);
1.7226 + sqlite3BtreeLeave(pFrom);
1.7227 + sqlite3BtreeLeave(pTo);
1.7228 + return rc;
1.7229 +}
1.7230 +
1.7231 +#endif /* SQLITE_OMIT_VACUUM */
1.7232 +
1.7233 +/*
1.7234 +** Return non-zero if a transaction is active.
1.7235 +*/
1.7236 +int sqlite3BtreeIsInTrans(Btree *p){
1.7237 + assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
1.7238 + return (p && (p->inTrans==TRANS_WRITE));
1.7239 +}
1.7240 +
1.7241 +/*
1.7242 +** Return non-zero if a statement transaction is active.
1.7243 +*/
1.7244 +int sqlite3BtreeIsInStmt(Btree *p){
1.7245 + assert( sqlite3BtreeHoldsMutex(p) );
1.7246 + return (p->pBt && p->pBt->inStmt);
1.7247 +}
1.7248 +
1.7249 +/*
1.7250 +** Return non-zero if a read (or write) transaction is active.
1.7251 +*/
1.7252 +int sqlite3BtreeIsInReadTrans(Btree *p){
1.7253 + assert( sqlite3_mutex_held(p->db->mutex) );
1.7254 + return (p && (p->inTrans!=TRANS_NONE));
1.7255 +}
1.7256 +
1.7257 +/*
1.7258 +** This function returns a pointer to a blob of memory associated with
1.7259 +** a single shared-btree. The memory is used by client code for its own
1.7260 +** purposes (for example, to store a high-level schema associated with
1.7261 +** the shared-btree). The btree layer manages reference counting issues.
1.7262 +**
1.7263 +** The first time this is called on a shared-btree, nBytes bytes of memory
1.7264 +** are allocated, zeroed, and returned to the caller. For each subsequent
1.7265 +** call the nBytes parameter is ignored and a pointer to the same blob
1.7266 +** of memory returned.
1.7267 +**
1.7268 +** If the nBytes parameter is 0 and the blob of memory has not yet been
1.7269 +** allocated, a null pointer is returned. If the blob has already been
1.7270 +** allocated, it is returned as normal.
1.7271 +**
1.7272 +** Just before the shared-btree is closed, the function passed as the
1.7273 +** xFree argument when the memory allocation was made is invoked on the
1.7274 +** blob of allocated memory. This function should not call sqlite3_free()
1.7275 +** on the memory, the btree layer does that.
1.7276 +*/
1.7277 +void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
1.7278 + BtShared *pBt = p->pBt;
1.7279 + sqlite3BtreeEnter(p);
1.7280 + if( !pBt->pSchema && nBytes ){
1.7281 + pBt->pSchema = sqlite3MallocZero(nBytes);
1.7282 + pBt->xFreeSchema = xFree;
1.7283 + }
1.7284 + sqlite3BtreeLeave(p);
1.7285 + return pBt->pSchema;
1.7286 +}
1.7287 +
1.7288 +/*
1.7289 +** Return true if another user of the same shared btree as the argument
1.7290 +** handle holds an exclusive lock on the sqlite_master table.
1.7291 +*/
1.7292 +int sqlite3BtreeSchemaLocked(Btree *p){
1.7293 + int rc;
1.7294 + assert( sqlite3_mutex_held(p->db->mutex) );
1.7295 + sqlite3BtreeEnter(p);
1.7296 + rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
1.7297 + sqlite3BtreeLeave(p);
1.7298 + return rc;
1.7299 +}
1.7300 +
1.7301 +
1.7302 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.7303 +/*
1.7304 +** Obtain a lock on the table whose root page is iTab. The
1.7305 +** lock is a write lock if isWritelock is true or a read lock
1.7306 +** if it is false.
1.7307 +*/
1.7308 +int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
1.7309 + int rc = SQLITE_OK;
1.7310 + if( p->sharable ){
1.7311 + u8 lockType = READ_LOCK + isWriteLock;
1.7312 + assert( READ_LOCK+1==WRITE_LOCK );
1.7313 + assert( isWriteLock==0 || isWriteLock==1 );
1.7314 + sqlite3BtreeEnter(p);
1.7315 + rc = queryTableLock(p, iTab, lockType);
1.7316 + if( rc==SQLITE_OK ){
1.7317 + rc = lockTable(p, iTab, lockType);
1.7318 + }
1.7319 + sqlite3BtreeLeave(p);
1.7320 + }
1.7321 + return rc;
1.7322 +}
1.7323 +#endif
1.7324 +
1.7325 +#ifndef SQLITE_OMIT_INCRBLOB
1.7326 +/*
1.7327 +** Argument pCsr must be a cursor opened for writing on an
1.7328 +** INTKEY table currently pointing at a valid table entry.
1.7329 +** This function modifies the data stored as part of that entry.
1.7330 +** Only the data content may only be modified, it is not possible
1.7331 +** to change the length of the data stored.
1.7332 +*/
1.7333 +int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
1.7334 + assert( cursorHoldsMutex(pCsr) );
1.7335 + assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
1.7336 + assert(pCsr->isIncrblobHandle);
1.7337 +
1.7338 + restoreCursorPosition(pCsr);
1.7339 + assert( pCsr->eState!=CURSOR_REQUIRESEEK );
1.7340 + if( pCsr->eState!=CURSOR_VALID ){
1.7341 + return SQLITE_ABORT;
1.7342 + }
1.7343 +
1.7344 + /* Check some preconditions:
1.7345 + ** (a) the cursor is open for writing,
1.7346 + ** (b) there is no read-lock on the table being modified and
1.7347 + ** (c) the cursor points at a valid row of an intKey table.
1.7348 + */
1.7349 + if( !pCsr->wrFlag ){
1.7350 + return SQLITE_READONLY;
1.7351 + }
1.7352 + assert( !pCsr->pBt->readOnly
1.7353 + && pCsr->pBt->inTransaction==TRANS_WRITE );
1.7354 + if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
1.7355 + return SQLITE_LOCKED; /* The table pCur points to has a read lock */
1.7356 + }
1.7357 + if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
1.7358 + return SQLITE_ERROR;
1.7359 + }
1.7360 +
1.7361 + return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
1.7362 +}
1.7363 +
1.7364 +/*
1.7365 +** Set a flag on this cursor to cache the locations of pages from the
1.7366 +** overflow list for the current row. This is used by cursors opened
1.7367 +** for incremental blob IO only.
1.7368 +**
1.7369 +** This function sets a flag only. The actual page location cache
1.7370 +** (stored in BtCursor.aOverflow[]) is allocated and used by function
1.7371 +** accessPayload() (the worker function for sqlite3BtreeData() and
1.7372 +** sqlite3BtreePutData()).
1.7373 +*/
1.7374 +void sqlite3BtreeCacheOverflow(BtCursor *pCur){
1.7375 + assert( cursorHoldsMutex(pCur) );
1.7376 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.7377 + assert(!pCur->isIncrblobHandle);
1.7378 + assert(!pCur->aOverflow);
1.7379 + pCur->isIncrblobHandle = 1;
1.7380 +}
1.7381 +#endif