1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sql/SQLite364/btree.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,7414 @@
1.4 +/*
1.5 +** 2004 April 6
1.6 +**
1.7 +** The author disclaims copyright to this source code. In place of
1.8 +** a legal notice, here is a blessing:
1.9 +**
1.10 +** May you do good and not evil.
1.11 +** May you find forgiveness for yourself and forgive others.
1.12 +** May you share freely, never taking more than you give.
1.13 +**
1.14 +*************************************************************************
1.15 +** $Id: btree.c,v 1.525 2008/10/08 17:58:49 danielk1977 Exp $
1.16 +**
1.17 +** This file implements an external (disk-based) database using BTrees.
1.18 +** See the header comment on "btreeInt.h" for additional information,
1.19 +** including a description of file format and an overview of operation.
1.20 +*/
1.21 +#include "btreeInt.h"
1.22 +
1.23 +/*
1.24 +** The header string that appears at the beginning of every
1.25 +** SQLite database. It is initialized from SQLITE_FILE_HEADER.
1.26 +*/
1.27 +static const char zMagicHeader[] = SQLITE_FILE_HEADER;
1.28 +
1.29 +/*
1.30 +** Set this global variable to 1 to enable tracing using the TRACE
1.31 +** macro. The #if 0 guard below compiles tracing out, so TRACE(X) expands to nothing.
1.32 +*/
1.33 +#if 0
1.34 +int sqlite3BtreeTrace=0; /* True to enable tracing */
1.35 +# define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
1.36 +#else
1.37 +# define TRACE(X)
1.38 +#endif
1.39 +
1.40 +/*
1.41 +** Sometimes we need a small amount of code such as a variable initialization
1.42 +** to set up for a later assert() statement. We do not want this code to
1.43 +** appear when assert() is disabled. The following macro is therefore
1.44 +** used to contain that setup code. The "VVA" acronym stands for
1.45 +** "Verification, Validation, and Accreditation". In other words, the
1.46 +** code within VVA_ONLY() is compiled only when NDEBUG is not defined.
1.47 +*/
1.48 +#ifndef NDEBUG
1.49 +# define VVA_ONLY(X) X
1.50 +#else /* NDEBUG is defined: the argument is discarded */
1.51 +# define VVA_ONLY(X)
1.52 +#endif /* NDEBUG */
1.53 +
1.54 +
1.55 +
1.56 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.57 +/*
1.58 +** A list of BtShared objects that are eligible for participation
1.59 +** in shared cache. This variable has file scope during normal builds,
1.60 +** but the test harness needs to access it so we make it global for
1.61 +** test builds (builds with SQLITE_TEST defined).
1.62 +*/
1.63 +#ifdef SQLITE_TEST
1.64 +BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
1.65 +#else
1.66 +static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
1.67 +#endif
1.68 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.69 +
1.70 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.71 +/*
1.72 +** Enable or disable the shared pager and schema features.
1.73 +**
1.74 +** This routine has no effect on existing database connections.
1.75 +** The shared cache setting affects only future calls to
1.76 +** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2(). Always returns SQLITE_OK.
1.77 +*/
1.78 +int sqlite3_enable_shared_cache(int enable){
1.79 + sqlite3GlobalConfig.sharedCacheEnabled = enable; /* stored as given, no validation */
1.80 + return SQLITE_OK;
1.81 +}
1.82 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.83 +
1.84 +
1.85 +/*
1.86 +** Forward declaration so checkReadLocks() can be referenced before its definition.
1.87 +*/
1.88 +static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
1.89 +
1.90 +
1.91 +#ifdef SQLITE_OMIT_SHARED_CACHE
1.92 + /*
1.93 + ** The functions queryTableLock(), lockTable() and unlockAllTables()
1.94 + ** manipulate entries in the BtShared.pLock linked list used to store
1.95 + ** shared-cache table level locks. If the library is compiled with the
1.96 + ** shared-cache feature disabled, then there is only ever one user
1.97 + ** of each BtShared structure and so this locking is not necessary.
1.98 + ** So define the lock-related functions as no-ops.
1.99 + */
1.100 + #define queryTableLock(a,b,c) SQLITE_OK
1.101 + #define lockTable(a,b,c) SQLITE_OK
1.102 + #define unlockAllTables(a)
1.103 +#endif /* SQLITE_OMIT_SHARED_CACHE */
1.104 +
1.105 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.106 +/*
1.107 +** Query to see if btree handle p may obtain a lock of type eLock
1.108 +** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
1.109 +** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
1.110 +** SQLITE_LOCKED if not. This routine only inspects state; it adds no lock.
1.111 +*/
1.112 +static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
1.113 + BtShared *pBt = p->pBt;
1.114 + BtLock *pIter; /* Iterator over the BtShared.pLock list */
1.115 +
1.116 + assert( sqlite3BtreeHoldsMutex(p) );
1.117 + assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
1.118 + assert( p->db!=0 );
1.119 +
1.120 + /* This is a no-op if the shared-cache is not enabled */
1.121 + if( !p->sharable ){
1.122 + return SQLITE_OK;
1.123 + }
1.124 +
1.125 + /* If some other connection is holding an exclusive lock, the
1.126 + ** requested lock may not be obtained.
1.127 + */
1.128 + if( pBt->pExclusive && pBt->pExclusive!=p ){
1.129 + return SQLITE_LOCKED;
1.130 + }
1.131 +
1.132 + /* This (along with lockTable()) is where the ReadUncommitted flag is
1.133 + ** dealt with. If the caller is querying for a read-lock and the flag is
1.134 + ** set, it is unconditionally granted - even if there are write-locks
1.135 + ** on the table. If a write-lock is requested, the ReadUncommitted flag
1.136 + ** is not considered.
1.137 + **
1.138 + ** In function lockTable(), if a read-lock is demanded and the
1.139 + ** ReadUncommitted flag is set, no entry is added to the locks list
1.140 + ** (BtShared.pLock).
1.141 + **
1.142 + ** To summarize: If the ReadUncommitted flag is set, then read cursors do
1.143 + ** not create or respect table locks. The locking procedure for a
1.144 + ** write-cursor does not change.
1.145 + */
1.146 + if(
1.147 + 0==(p->db->flags&SQLITE_ReadUncommitted) ||
1.148 + eLock==WRITE_LOCK ||
1.149 + iTab==MASTER_ROOT /* locks on MASTER_ROOT are checked even with ReadUncommitted */
1.150 + ){
1.151 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.152 + if( pIter->pBtree!=p && pIter->iTable==iTab &&
1.153 + (pIter->eLock!=eLock || eLock!=READ_LOCK) ){ /* only read/read is compatible */
1.154 + return SQLITE_LOCKED;
1.155 + }
1.156 + }
1.157 + }
1.158 + return SQLITE_OK;
1.159 +}
1.160 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.161 +
1.162 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.163 +/*
1.164 +** Add a lock on the table with root-page iTable to the shared-btree used
1.165 +** by Btree handle p. Parameter eLock must be either READ_LOCK or
1.166 +** WRITE_LOCK. The caller must already know the lock is obtainable.
1.167 +**
1.168 +** SQLITE_OK is returned if the lock is added successfully (or no lock is
1.169 +** needed). SQLITE_NOMEM is returned if allocating a new BtLock fails.
1.170 +*/
1.171 +static int lockTable(Btree *p, Pgno iTable, u8 eLock){
1.172 + BtShared *pBt = p->pBt;
1.173 + BtLock *pLock = 0; /* Lock entry for (p, iTable), once found or created */
1.174 + BtLock *pIter;
1.175 +
1.176 + assert( sqlite3BtreeHoldsMutex(p) );
1.177 + assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
1.178 + assert( p->db!=0 );
1.179 +
1.180 + /* This is a no-op if the shared-cache is not enabled */
1.181 + if( !p->sharable ){
1.182 + return SQLITE_OK;
1.183 + }
1.184 +
1.185 + assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
1.186 +
1.187 + /* If the read-uncommitted flag is set and a read-lock is requested,
1.188 + ** return early without adding an entry to the BtShared.pLock list. See
1.189 + ** comment in function queryTableLock() for more info on handling
1.190 + ** the ReadUncommitted flag.
1.191 + */
1.192 + if(
1.193 + (p->db->flags&SQLITE_ReadUncommitted) &&
1.194 + (eLock==READ_LOCK) &&
1.195 + iTable!=MASTER_ROOT
1.196 + ){
1.197 + return SQLITE_OK;
1.198 + }
1.199 +
1.200 + /* First search the list for an existing lock on this table. */
1.201 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.202 + if( pIter->iTable==iTable && pIter->pBtree==p ){
1.203 + pLock = pIter;
1.204 + break;
1.205 + }
1.206 + }
1.207 +
1.208 + /* If the above search did not find a BtLock struct associating Btree p
1.209 + ** with table iTable, allocate one and link it into the list.
1.210 + */
1.211 + if( !pLock ){
1.212 + pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
1.213 + if( !pLock ){
1.214 + return SQLITE_NOMEM;
1.215 + }
1.216 + pLock->iTable = iTable;
1.217 + pLock->pBtree = p;
1.218 + pLock->pNext = pBt->pLock; /* push onto the head of the list */
1.219 + pBt->pLock = pLock;
1.220 + }
1.221 +
1.222 + /* Set the BtLock.eLock variable to the maximum of the current lock
1.223 + ** and the requested lock. This means if a write-lock was already held
1.224 + ** and a read-lock requested, the lock is not incorrectly downgraded.
1.225 + */
1.226 + assert( WRITE_LOCK>READ_LOCK );
1.227 + if( eLock>pLock->eLock ){
1.228 + pLock->eLock = eLock;
1.229 + }
1.230 +
1.231 + return SQLITE_OK;
1.232 +}
1.233 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.234 +
1.235 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.236 +/*
1.237 +** Release all the table locks (locks obtained via calls to the lockTable()
1.238 +** procedure) held by Btree handle p. Also clears BtShared.pExclusive if it is p.
1.239 +*/
1.240 +static void unlockAllTables(Btree *p){
1.241 + BtShared *pBt = p->pBt;
1.242 + BtLock **ppIter = &pBt->pLock; /* Address of the link that points at the current entry */
1.243 +
1.244 + assert( sqlite3BtreeHoldsMutex(p) );
1.245 + assert( p->sharable || 0==*ppIter );
1.246 +
1.247 + while( *ppIter ){
1.248 + BtLock *pLock = *ppIter;
1.249 + assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
1.250 + if( pLock->pBtree==p ){
1.251 + *ppIter = pLock->pNext; /* unlink; ppIter now refers to the following entry */
1.252 + sqlite3_free(pLock);
1.253 + }else{
1.254 + ppIter = &pLock->pNext;
1.255 + }
1.256 + }
1.257 +
1.258 + if( pBt->pExclusive==p ){
1.259 + pBt->pExclusive = 0;
1.260 + }
1.261 +}
1.262 +#endif /* !SQLITE_OMIT_SHARED_CACHE */
1.263 +
1.264 +static void releasePage(MemPage *pPage); /* Forward reference; called by saveCursorPosition() */
1.265 +
1.266 +/*
1.267 +** Verify that the cursor holds a mutex on the BtShared. Compiled only when NDEBUG is not defined.
1.268 +*/
1.269 +#ifndef NDEBUG
1.270 +static int cursorHoldsMutex(BtCursor *p){
1.271 + return sqlite3_mutex_held(p->pBt->mutex); /* result is passed through unchanged */
1.272 +}
1.273 +#endif /* !NDEBUG */
1.274 +
1.275 +
1.276 +#ifndef SQLITE_OMIT_INCRBLOB
1.277 +/*
1.278 +** Invalidate the overflow page-list cache for cursor pCur, if any, by freeing pCur->aOverflow.
1.279 +*/
1.280 +static void invalidateOverflowCache(BtCursor *pCur){
1.281 + assert( cursorHoldsMutex(pCur) );
1.282 + sqlite3_free(pCur->aOverflow);
1.283 + pCur->aOverflow = 0; /* leave no dangling pointer behind */
1.284 +}
1.285 +
1.286 +/*
1.287 +** Invalidate the overflow page-list cache for all cursors opened
1.288 +** on the shared btree structure pBt (every entry of the pBt->pCursor list).
1.289 +*/
1.290 +static void invalidateAllOverflowCache(BtShared *pBt){
1.291 + BtCursor *p;
1.292 + assert( sqlite3_mutex_held(pBt->mutex) );
1.293 + for(p=pBt->pCursor; p; p=p->pNext){
1.294 + invalidateOverflowCache(p);
1.295 + }
1.296 +}
1.297 +#else /* SQLITE_OMIT_INCRBLOB: both routines become no-ops */
1.298 + #define invalidateOverflowCache(x)
1.299 + #define invalidateAllOverflowCache(x)
1.300 +#endif
1.301 +
1.302 +/*
1.303 +** Save the current cursor position in the variables BtCursor.nKey
1.304 +** and BtCursor.pKey. On success the cursor pages are released and the state is set to CURSOR_REQUIRESEEK.
1.305 +*/
1.306 +static int saveCursorPosition(BtCursor *pCur){
1.307 + int rc;
1.308 +
1.309 + assert( CURSOR_VALID==pCur->eState );
1.310 + assert( 0==pCur->pKey );
1.311 + assert( cursorHoldsMutex(pCur) );
1.312 +
1.313 + rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
1.314 +
1.315 + /* If this is an intKey table, then the above call to BtreeKeySize()
1.316 + ** stores the integer key in pCur->nKey. In this case this value is
1.317 + ** all that is required. Otherwise, if pCur is not open on an intKey
1.318 + ** table, then malloc space for and store the pCur->nKey bytes of key
1.319 + ** data.
1.320 + */
1.321 + if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
1.322 + void *pKey = sqlite3Malloc(pCur->nKey);
1.323 + if( pKey ){
1.324 + rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
1.325 + if( rc==SQLITE_OK ){
1.326 + pCur->pKey = pKey; /* the cursor now owns the buffer */
1.327 + }else{
1.328 + sqlite3_free(pKey);
1.329 + }
1.330 + }else{
1.331 + rc = SQLITE_NOMEM;
1.332 + }
1.333 + }
1.334 + assert( !pCur->apPage[0]->intKey || !pCur->pKey );
1.335 +
1.336 + if( rc==SQLITE_OK ){
1.337 + int i;
1.338 + for(i=0; i<=pCur->iPage; i++){
1.339 + releasePage(pCur->apPage[i]);
1.340 + pCur->apPage[i] = 0;
1.341 + }
1.342 + pCur->iPage = -1;
1.343 + pCur->eState = CURSOR_REQUIRESEEK;
1.344 + }
1.345 +
1.346 + invalidateOverflowCache(pCur); /* done whether or not the save succeeded */
1.347 + return rc;
1.348 +}
1.349 +
1.350 +/*
1.351 +** Save the positions of all cursors except pExcept open on the table
1.352 +** with root-page iRoot (iRoot==0 matches every table). Usually, this is called just before cursor
1.353 +** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()). Returns the first error hit.
1.354 +*/
1.355 +static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
1.356 + BtCursor *p;
1.357 + assert( sqlite3_mutex_held(pBt->mutex) );
1.358 + assert( pExcept==0 || pExcept->pBt==pBt );
1.359 + for(p=pBt->pCursor; p; p=p->pNext){
1.360 + if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
1.361 + p->eState==CURSOR_VALID ){ /* only cursors in CURSOR_VALID state are saved */
1.362 + int rc = saveCursorPosition(p);
1.363 + if( SQLITE_OK!=rc ){
1.364 + return rc;
1.365 + }
1.366 + }
1.367 + }
1.368 + return SQLITE_OK;
1.369 +}
1.370 +
1.371 +/*
1.372 +** Clear the current cursor position: free any saved key and mark the cursor CURSOR_INVALID.
1.373 +*/
1.374 +void sqlite3BtreeClearCursor(BtCursor *pCur){
1.375 + assert( cursorHoldsMutex(pCur) );
1.376 + sqlite3_free(pCur->pKey);
1.377 + pCur->pKey = 0;
1.378 + pCur->eState = CURSOR_INVALID;
1.379 +}
1.380 +
1.381 +/*
1.382 +** Restore the cursor to the position it was in (or as close to as possible)
1.383 +** when saveCursorPosition() was called. Note that this call deletes the
1.384 +** saved position info stored by saveCursorPosition(), so there can be
1.385 +** at most one effective restoreCursorPosition() call after each
1.386 +** saveCursorPosition(). Returns the result of sqlite3BtreeMoveto(), or pCur->skip for a CURSOR_FAULT cursor.
1.387 +*/
1.388 +int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
1.389 + int rc;
1.390 + assert( cursorHoldsMutex(pCur) );
1.391 + assert( pCur->eState>=CURSOR_REQUIRESEEK );
1.392 + if( pCur->eState==CURSOR_FAULT ){
1.393 + return pCur->skip; /* NOTE(review): skip presumably holds the error code in this state - confirm */
1.394 + }
1.395 + pCur->eState = CURSOR_INVALID;
1.396 + rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
1.397 + if( rc==SQLITE_OK ){
1.398 + sqlite3_free(pCur->pKey); /* saved key is discarded only after a successful seek */
1.399 + pCur->pKey = 0;
1.400 + assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
1.401 + }
1.402 + return rc;
1.403 +}
1.404 +
1.405 +#define restoreCursorPosition(p) /* restore only if a position was saved or a fault is pending */ \
1.406 + ((p)->eState>=CURSOR_REQUIRESEEK ? /* argument parenthesized so any pointer expression works */ \
1.407 + sqlite3BtreeRestoreCursorPosition(p) : \
1.408 + SQLITE_OK)
1.409 +
1.410 +/*
1.411 +** Determine whether or not a cursor has moved from the position it
1.412 +** was last placed at. Cursors can move when the row they are pointing
1.413 +** at is deleted out from under them.
1.414 +**
1.415 +** This routine returns an error code if something goes wrong. The
1.416 +** integer *pHasMoved is set to one if the cursor has moved (or on error) and 0 if not.
1.417 +*/
1.418 +int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
1.419 + int rc;
1.420 +
1.421 + rc = restoreCursorPosition(pCur);
1.422 + if( rc ){
1.423 + *pHasMoved = 1;
1.424 + return rc;
1.425 + }
1.426 + if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){ /* not valid, or not an exact restore */
1.427 + *pHasMoved = 1;
1.428 + }else{
1.429 + *pHasMoved = 0;
1.430 + }
1.431 + return SQLITE_OK;
1.432 +}
1.433 +
1.434 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.435 +/*
1.436 +** Given a page number of a regular database page, return the page
1.437 +** number for the pointer-map page that contains the entry for the
1.438 +** input page number. The result is never PENDING_BYTE_PAGE(pBt).
1.439 +*/
1.440 +static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
1.441 + int nPagesPerMapPage, iPtrMap, ret;
1.442 + assert( sqlite3_mutex_held(pBt->mutex) );
1.443 + nPagesPerMapPage = (pBt->usableSize/5)+1; /* 5-byte entries; +1 presumably counts the map page itself */
1.444 + iPtrMap = (pgno-2)/nPagesPerMapPage;
1.445 + ret = (iPtrMap*nPagesPerMapPage) + 2;
1.446 + if( ret==PENDING_BYTE_PAGE(pBt) ){
1.447 + ret++; /* step past the pending-byte page */
1.448 + }
1.449 + return ret;
1.450 +}
1.451 +
1.452 +/*
1.453 +** Write an entry into the pointer map.
1.454 +**
1.455 +** This routine updates the pointer map entry for page number 'key'
1.456 +** so that it maps to type 'eType' and parent page number 'parent'.
1.457 +** An error code is returned if something goes wrong (SQLITE_CORRUPT for key==0), otherwise SQLITE_OK.
1.458 +*/
1.459 +static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
1.460 + DbPage *pDbPage; /* The pointer map page */
1.461 + u8 *pPtrmap; /* The pointer map data */
1.462 + Pgno iPtrmap; /* The pointer map page number */
1.463 + int offset; /* Offset in pointer map page */
1.464 + int rc;
1.465 +
1.466 + assert( sqlite3_mutex_held(pBt->mutex) );
1.467 + /* The pending-byte page must never be used as a pointer map page */
1.468 + assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
1.469 +
1.470 + assert( pBt->autoVacuum );
1.471 + if( key==0 ){
1.472 + return SQLITE_CORRUPT_BKPT;
1.473 + }
1.474 + iPtrmap = PTRMAP_PAGENO(pBt, key);
1.475 + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
1.476 + if( rc!=SQLITE_OK ){
1.477 + return rc;
1.478 + }
1.479 + offset = PTRMAP_PTROFFSET(iPtrmap, key);
1.480 + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1.481 +
1.482 + if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){ /* write only if the entry changes */
1.483 + TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1.484 + rc = sqlite3PagerWrite(pDbPage);
1.485 + if( rc==SQLITE_OK ){
1.486 + pPtrmap[offset] = eType; /* entry layout: 1 type byte ... */
1.487 + put4byte(&pPtrmap[offset+1], parent); /* ... followed by the 4-byte parent page number */
1.488 + }
1.489 + }
1.490 +
1.491 + sqlite3PagerUnref(pDbPage);
1.492 + return rc;
1.493 +}
1.494 +
1.495 +/*
1.496 +** Read an entry from the pointer map.
1.497 +**
1.498 +** This routine retrieves the pointer map entry for page 'key', writing
1.499 +** the type and parent page number to *pEType and *pPgno respectively (pPgno may be NULL).
1.500 +** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1.501 +*/
1.502 +static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1.503 + DbPage *pDbPage; /* The pointer map page */
1.504 + int iPtrmap; /* Pointer map page index */
1.505 + u8 *pPtrmap; /* Pointer map page data */
1.506 + int offset; /* Offset of entry in pointer map */
1.507 + int rc;
1.508 +
1.509 + assert( sqlite3_mutex_held(pBt->mutex) );
1.510 +
1.511 + iPtrmap = PTRMAP_PAGENO(pBt, key);
1.512 + rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
1.513 + if( rc!=0 ){
1.514 + return rc;
1.515 + }
1.516 + pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1.517 +
1.518 + offset = PTRMAP_PTROFFSET(iPtrmap, key);
1.519 + assert( pEType!=0 );
1.520 + *pEType = pPtrmap[offset];
1.521 + if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1.522 +
1.523 + sqlite3PagerUnref(pDbPage);
1.524 + if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; /* type codes outside 1..5 are reported as corruption */
1.525 + return SQLITE_OK;
1.526 +}
1.527 +
1.528 +#else /* SQLITE_OMIT_AUTOVACUUM is defined: pointer-map calls reduce to SQLITE_OK */
1.529 + #define ptrmapPut(w,x,y,z) SQLITE_OK
1.530 + #define ptrmapGet(w,x,y,z) SQLITE_OK
1.531 + #define ptrmapPutOvfl(y,z) SQLITE_OK
1.532 +#endif
1.533 +
1.534 +/*
1.535 +** Given a btree page and a cell index (0 means the first cell on
1.536 +** the page, 1 means the second cell, and so forth) return a pointer
1.537 +** to the cell content.
1.538 +** The 2-byte cell pointer read at cellOffset+2*I is masked with maskPage before use.
1.539 +** This routine works only for pages that do not contain overflow cells.
1.540 +*/
1.541 +#define findCell(P,I) \
1.542 + ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
1.543 +
1.544 +/*
1.545 +** This is a more complex version of findCell() that works for
1.546 +** pages that do contain overflow cells (entries in pPage->aOvfl[]).
1.547 +*/
1.548 +static u8 *findOverflowCell(MemPage *pPage, int iCell){
1.549 + int i;
1.550 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.551 + for(i=pPage->nOverflow-1; i>=0; i--){ /* scan overflow cells from last to first */
1.552 + int k;
1.553 + struct _OvflCell *pOvfl;
1.554 + pOvfl = &pPage->aOvfl[i];
1.555 + k = pOvfl->idx;
1.556 + if( k<=iCell ){
1.557 + if( k==iCell ){
1.558 + return pOvfl->pCell;
1.559 + }
1.560 + iCell--; /* an overflow cell at or before iCell shifts the on-page index down by one */
1.561 + }
1.562 + }
1.563 + return findCell(pPage, iCell);
1.564 +}
1.565 +
1.566 +/*
1.567 +** Parse a cell content block and fill in the CellInfo structure. There
1.568 +** are two versions of this function. sqlite3BtreeParseCell() takes a
1.569 +** cell index as the second argument and sqlite3BtreeParseCellPtr()
1.570 +** takes a pointer to the body of the cell as its second argument.
1.571 +**
1.572 +** Within this file, the parseCell() macro can be called instead of
1.573 +** sqlite3BtreeParseCell(). Using some compilers, this will be faster.
1.574 +*/
1.575 +void sqlite3BtreeParseCellPtr(
1.576 + MemPage *pPage, /* Page containing the cell */
1.577 + u8 *pCell, /* Pointer to the cell text. */
1.578 + CellInfo *pInfo /* Fill in this structure */
1.579 +){
1.580 + int n; /* Number bytes in cell content header */
1.581 + u32 nPayload; /* Number of bytes of cell payload */
1.582 +
1.583 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.584 +
1.585 + pInfo->pCell = pCell;
1.586 + assert( pPage->leaf==0 || pPage->leaf==1 );
1.587 + n = pPage->childPtrSize; /* header starts with the child pointer, if the page has one */
1.588 + assert( n==4-4*pPage->leaf );
1.589 + if( pPage->intKey ){
1.590 + if( pPage->hasData ){
1.591 + n += getVarint32(&pCell[n], nPayload);
1.592 + }else{
1.593 + nPayload = 0;
1.594 + }
1.595 + n += getVarint(&pCell[n], (u64*)&pInfo->nKey); /* integer key is stored directly in nKey */
1.596 + pInfo->nData = nPayload;
1.597 + }else{
1.598 + pInfo->nData = 0;
1.599 + n += getVarint32(&pCell[n], nPayload);
1.600 + pInfo->nKey = nPayload; /* for non-intKey pages the payload is the key */
1.601 + }
1.602 + pInfo->nPayload = nPayload;
1.603 + pInfo->nHeader = n;
1.604 + if( likely(nPayload<=pPage->maxLocal) ){
1.605 + /* This is the (easy) common case where the entire payload fits
1.606 + ** on the local page. No overflow is required.
1.607 + */
1.608 + int nSize; /* Total size of cell content in bytes */
1.609 + nSize = nPayload + n;
1.610 + pInfo->nLocal = nPayload;
1.611 + pInfo->iOverflow = 0;
1.612 + if( (nSize & ~3)==0 ){
1.613 + nSize = 4; /* Minimum cell size is 4 */
1.614 + }
1.615 + pInfo->nSize = nSize;
1.616 + }else{
1.617 + /* If the payload will not fit completely on the local page, we have
1.618 + ** to decide how much to store locally and how much to spill onto
1.619 + ** overflow pages. The strategy is to minimize the amount of unused
1.620 + ** space on overflow pages while keeping the amount of local storage
1.621 + ** in between minLocal and maxLocal.
1.622 + **
1.623 + ** Warning: changing the way overflow payload is distributed in any
1.624 + ** way will result in an incompatible file format.
1.625 + */
1.626 + int minLocal; /* Minimum amount of payload held locally */
1.627 + int maxLocal; /* Maximum amount of payload held locally */
1.628 + int surplus; /* Overflow payload available for local storage */
1.629 +
1.630 + minLocal = pPage->minLocal;
1.631 + maxLocal = pPage->maxLocal;
1.632 + surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
1.633 + if( surplus <= maxLocal ){
1.634 + pInfo->nLocal = surplus;
1.635 + }else{
1.636 + pInfo->nLocal = minLocal;
1.637 + }
1.638 + pInfo->iOverflow = pInfo->nLocal + n; /* offset of the overflow page number within the cell */
1.639 + pInfo->nSize = pInfo->iOverflow + 4; /* +4 for that page number */
1.640 + }
1.641 +}
1.642 +#define parseCell(pPage, iCell, pInfo) \
1.643 + sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
1.644 +void sqlite3BtreeParseCell(
1.645 + MemPage *pPage, /* Page containing the cell */
1.646 + int iCell, /* The cell index. First cell is 0 */
1.647 + CellInfo *pInfo /* Fill in this structure */
1.648 +){
1.649 + parseCell(pPage, iCell, pInfo); /* locates the cell with findCell(), so not valid for overflow cells */
1.650 +}
1.651 +
1.652 +/*
1.653 +** Compute the total number of bytes that a Cell needs in the cell
1.654 +** data area of the btree-page. The return number includes the cell
1.655 +** data header and the local payload, but not any overflow page or
1.656 +** the space used by the cell pointer. cellSize() exists only when NDEBUG is not defined.
1.657 +*/
1.658 +#ifndef NDEBUG
1.659 +static u16 cellSize(MemPage *pPage, int iCell){
1.660 + CellInfo info;
1.661 + sqlite3BtreeParseCell(pPage, iCell, &info);
1.662 + return info.nSize;
1.663 +}
1.664 +#endif
1.665 +static u16 cellSizePtr(MemPage *pPage, u8 *pCell){ /* as cellSize(), but takes a pointer to the cell */
1.666 + CellInfo info;
1.667 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.668 + return info.nSize;
1.669 +}
1.670 +
1.671 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.672 +/*
1.673 +** If the cell pCell, part of page pPage contains a pointer
1.674 +** to an overflow page, insert an entry into the pointer-map
1.675 +** for the overflow page. Returns the ptrmapPut() result, or SQLITE_OK if there is no overflow.
1.676 +*/
1.677 +static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
1.678 + CellInfo info;
1.679 + assert( pCell!=0 );
1.680 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.681 + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1.682 + if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ /* payload exceeds what is stored locally */
1.683 + Pgno ovfl = get4byte(&pCell[info.iOverflow]);
1.684 + return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
1.685 + }
1.686 + return SQLITE_OK;
1.687 +}
1.688 +/*
1.689 +** If the cell with index iCell on page pPage contains a pointer
1.690 +** to an overflow page, insert an entry into the pointer-map
1.691 +** for the overflow page. The cell is located with findOverflowCell().
1.692 +*/
1.693 +static int ptrmapPutOvfl(MemPage *pPage, int iCell){
1.694 + u8 *pCell;
1.695 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.696 + pCell = findOverflowCell(pPage, iCell);
1.697 + return ptrmapPutOvflPtr(pPage, pCell);
1.698 +}
1.699 +#endif /* !SQLITE_OMIT_AUTOVACUUM */
1.700 +
1.701 +
1.702 +/*
1.703 +** Defragment the page given. All Cells are moved to the
1.704 +** end of the page and all free space is collected into one
1.705 +** big FreeBlk that occurs in between the header and cell
1.706 +** pointer array and the cell content area. Returns SQLITE_OK, or SQLITE_CORRUPT if the page is inconsistent.
1.707 +*/
1.708 +static int defragmentPage(MemPage *pPage){
1.709 + int i; /* Loop counter */
1.710 + int pc; /* Address of a i-th cell */
1.711 + int addr; /* Offset of first byte after cell pointer array */
1.712 + int hdr; /* Offset to the page header */
1.713 + int size; /* Size of a cell */
1.714 + int usableSize; /* Number of usable bytes on a page */
1.715 + int cellOffset; /* Offset to the cell pointer array */
1.716 + int cbrk; /* Offset to the cell content area */
1.717 + int nCell; /* Number of cells on the page */
1.718 + unsigned char *data; /* The page data */
1.719 + unsigned char *temp; /* Temp area for cell content */
1.720 +
1.721 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.722 + assert( pPage->pBt!=0 );
1.723 + assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
1.724 + assert( pPage->nOverflow==0 );
1.725 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.726 + temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
1.727 + data = pPage->aData;
1.728 + hdr = pPage->hdrOffset;
1.729 + cellOffset = pPage->cellOffset;
1.730 + nCell = pPage->nCell;
1.731 + assert( nCell==get2byte(&data[hdr+3]) );
1.732 + usableSize = pPage->pBt->usableSize;
1.733 + cbrk = get2byte(&data[hdr+5]);
1.734 + memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk); /* snapshot the content area; cells are copied back from it */
1.735 + cbrk = usableSize; /* repack downward from the end of the usable area */
1.736 + for(i=0; i<nCell; i++){
1.737 + u8 *pAddr; /* The i-th cell pointer */
1.738 + pAddr = &data[cellOffset + i*2];
1.739 + pc = get2byte(pAddr);
1.740 + if( pc>=usableSize ){
1.741 + return SQLITE_CORRUPT_BKPT;
1.742 + }
1.743 + size = cellSizePtr(pPage, &temp[pc]);
1.744 + cbrk -= size;
1.745 + if( cbrk<cellOffset+2*nCell || pc+size>usableSize ){ /* ran into the pointer array, or cell overruns the page */
1.746 + return SQLITE_CORRUPT_BKPT;
1.747 + }
1.748 + assert( cbrk+size<=usableSize && cbrk>=0 );
1.749 + memcpy(&data[cbrk], &temp[pc], size);
1.750 + put2byte(pAddr, cbrk);
1.751 + }
1.752 + assert( cbrk>=cellOffset+2*nCell );
1.753 + put2byte(&data[hdr+5], cbrk); /* new start of the cell content area */
1.754 + data[hdr+1] = 0; /* header bytes 1-2: first freeblock offset, now none */
1.755 + data[hdr+2] = 0;
1.756 + data[hdr+7] = 0; /* header byte 7: fragmented byte count, now zero */
1.757 + addr = cellOffset+2*nCell;
1.758 + memset(&data[addr], 0, cbrk-addr);
1.759 + if( cbrk-addr!=pPage->nFree ){ /* the single gap must equal the recorded free space */
1.760 + return SQLITE_CORRUPT_BKPT;
1.761 + }
1.762 + return SQLITE_OK;
1.763 +}
1.764 +
1.765 +/*
1.766 +** Allocate nByte bytes of space on a page.
1.767 +**
1.768 +** Return the index into pPage->aData[] of the first byte of
1.769 +** the new allocation. The caller guarantees that there is enough
1.770 +** space. This routine has no way to report failure.
1.771 +**
1.772 +** If the page contains nByte of free space but does not contain
1.773 +** nByte of contiguous free space, then this routine automatically
1.774 +** calls defragmentPage() to consolidate all free space before
1.775 +** allocating the new chunk.
1.776 +*/
1.777 +static int allocateSpace(MemPage *pPage, int nByte){
1.778 + int addr, pc, hdr;
1.779 + int size;
1.780 + int nFrag;
1.781 + int top;
1.782 + int nCell;
1.783 + int cellOffset;
1.784 + unsigned char *data;
1.785 +
1.786 + data = pPage->aData;
1.787 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.788 + assert( pPage->pBt );
1.789 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.790 + assert( nByte>=0 ); /* Minimum cell size is 4 */
1.791 + assert( pPage->nFree>=nByte );
1.792 + assert( pPage->nOverflow==0 );
1.793 + pPage->nFree -= nByte;
1.794 + hdr = pPage->hdrOffset;
1.795 +
1.796 + nFrag = data[hdr+7];
1.797 + if( nFrag<60 ){
1.798 + /* Search the freelist looking for a slot big enough to satisfy the
1.799 + ** space request. */
1.800 + addr = hdr+1;
1.801 + while( (pc = get2byte(&data[addr]))>0 ){
1.802 + size = get2byte(&data[pc+2]);
1.803 + if( size>=nByte ){
1.804 + if( size<nByte+4 ){ /* remainder too small for a freeblock header: take the whole block */
1.805 + memcpy(&data[addr], &data[pc], 2);
1.806 + data[hdr+7] = nFrag + size - nByte; /* leftover bytes are counted as fragments */
1.807 + return pc;
1.808 + }else{
1.809 + put2byte(&data[pc+2], size-nByte);
1.810 + return pc + size - nByte; /* allocate from the tail of the freeblock */
1.811 + }
1.812 + }
1.813 + addr = pc;
1.814 + }
1.815 + }
1.816 +
1.817 + /* Allocate memory from the gap in between the cell pointer array
1.818 + ** and the cell content area.
1.819 + */
1.820 + top = get2byte(&data[hdr+5]);
1.821 + nCell = get2byte(&data[hdr+3]);
1.822 + cellOffset = pPage->cellOffset;
1.823 + if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
1.824 + defragmentPage(pPage); /* NOTE(review): return code is discarded, so SQLITE_CORRUPT from defragmentPage() is not propagated */
1.825 + top = get2byte(&data[hdr+5]);
1.826 + }
1.827 + top -= nByte;
1.828 + assert( cellOffset + 2*nCell <= top );
1.829 + put2byte(&data[hdr+5], top);
1.830 + return top;
1.831 +}
1.832 +
1.833 +/*
1.834 +** Return a section of the pPage->aData to the freelist.
1.835 +** The first byte of the new free block is pPage->aData[start]
1.836 +** and the size of the block is "size" bytes. Returns SQLITE_OK or SQLITE_CORRUPT.
1.837 +**
1.838 +** Most of the effort here is involved in coalescing adjacent
1.839 +** free blocks into a single big free block.
1.840 +*/
1.841 +static int freeSpace(MemPage *pPage, int start, int size){
1.842 + int addr, pbegin, hdr;
1.843 + unsigned char *data = pPage->aData;
1.844 +
1.845 + assert( pPage->pBt!=0 );
1.846 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.847 + assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
1.848 + assert( (start + size)<=pPage->pBt->usableSize );
1.849 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.850 + assert( size>=0 ); /* Minimum cell size is 4 */
1.851 +
1.852 +#ifdef SQLITE_SECURE_DELETE
1.853 + /* Overwrite deleted information with zeros when the SECURE_DELETE
1.854 + ** option is enabled at compile-time */
1.855 + memset(&data[start], 0, size);
1.856 +#endif
1.857 +
1.858 + /* Add the space back into the linked list of freeblocks */
1.859 + hdr = pPage->hdrOffset;
1.860 + addr = hdr + 1;
1.861 + while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
1.862 + assert( pbegin<=pPage->pBt->usableSize-4 );
1.863 + if( pbegin<=addr ) return SQLITE_CORRUPT_BKPT; /* freeblock offsets must strictly increase */
1.864 + addr = pbegin;
1.865 + }
1.866 + if( pbegin>pPage->pBt->usableSize-4 ) return SQLITE_CORRUPT_BKPT;
1.867 + assert( pbegin>addr || pbegin==0 );
1.868 + put2byte(&data[addr], start); /* predecessor now points at the new block */
1.869 + put2byte(&data[start], pbegin); /* new block: 2-byte next offset ... */
1.870 + put2byte(&data[start+2], size); /* ... then 2-byte size */
1.871 + pPage->nFree += size;
1.872 +
1.873 + /* Coalesce adjacent free blocks */
1.874 + addr = pPage->hdrOffset + 1;
1.875 + while( (pbegin = get2byte(&data[addr]))>0 ){
1.876 + int pnext, psize;
1.877 + assert( pbegin>addr );
1.878 + assert( pbegin<=pPage->pBt->usableSize-4 );
1.879 + pnext = get2byte(&data[pbegin]);
1.880 + psize = get2byte(&data[pbegin+2]);
1.881 + if( pbegin + psize + 3 >= pnext && pnext>0 ){ /* next block starts within 3 bytes of the end of this one */
1.882 + int frag = pnext - (pbegin+psize);
1.883 + if( frag<0 || frag>data[pPage->hdrOffset+7] ) return SQLITE_CORRUPT_BKPT;
1.884 + data[pPage->hdrOffset+7] -= frag; /* absorbed gap bytes no longer count as fragments */
1.885 + put2byte(&data[pbegin], get2byte(&data[pnext]));
1.886 + put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
1.887 + }else{
1.888 + addr = pbegin;
1.889 + }
1.890 + }
1.891 +
1.892 + /* If the cell content area begins with a freeblock, remove it. */
1.893 + if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
1.894 + int top;
1.895 + pbegin = get2byte(&data[hdr+1]);
1.896 + memcpy(&data[hdr+1], &data[pbegin], 2);
1.897 + top = get2byte(&data[hdr+5]);
1.898 + put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2])); /* content area now starts after the removed block */
1.899 + }
1.900 + return SQLITE_OK;
1.901 +}
1.902 +
1.903 +/*
1.904 +** Decode the flags byte (the first byte of the header) for a page
1.905 +** and initialize the leaf, childPtrSize, intKey, hasData, maxLocal and minLocal
1.906 +** fields of the MemPage structure accordingly. Returns SQLITE_OK on success.
1.907 +** Only the following combinations are supported. Anything different
1.908 +** indicates a corrupt database file and yields SQLITE_CORRUPT_BKPT:
1.909 +**
1.910 +** PTF_ZERODATA
1.911 +** PTF_ZERODATA | PTF_LEAF
1.912 +** PTF_LEAFDATA | PTF_INTKEY
1.913 +** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1.914 +*/
1.915 +static int decodeFlags(MemPage *pPage, int flagByte){
1.916 + BtShared *pBt; /* A copy of pPage->pBt */
1.917 +
1.918 + assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1.919 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.920 + pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 ); /* PTF_LEAF is bit 3, so the shift exposes the leaf flag */
1.921 + flagByte &= ~PTF_LEAF; /* the leaf bit is handled; compare only the remaining bits below */
1.922 + pPage->childPtrSize = 4-4*pPage->leaf; /* 4 when leaf==0, 0 when leaf==1 */
1.923 + pBt = pPage->pBt;
1.924 + if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1.925 + pPage->intKey = 1;
1.926 + pPage->hasData = pPage->leaf; /* on intkey pages only leaves are marked as carrying data */
1.927 + pPage->maxLocal = pBt->maxLeaf;
1.928 + pPage->minLocal = pBt->minLeaf;
1.929 + }else if( flagByte==PTF_ZERODATA ){
1.930 + pPage->intKey = 0;
1.931 + pPage->hasData = 0;
1.932 + pPage->maxLocal = pBt->maxLocal;
1.933 + pPage->minLocal = pBt->minLocal;
1.934 + }else{
1.935 + return SQLITE_CORRUPT_BKPT; /* fields assigned above keep their new values on this path */
1.936 + }
1.937 + return SQLITE_OK;
1.938 +}
1.939 +
1.940 +/*
1.941 +** Initialize the auxiliary information for a disk block.
1.942 +**
1.943 +** Return SQLITE_OK on success. If we see that the page does
1.944 +** not contain a well-formed database page, then return
1.945 +** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
1.946 +** guarantee that the page is well-formed. It only shows that
1.947 +** we failed to detect any corruption. If pPage->isInit is already set, nothing is recomputed.
1.948 +*/
1.949 +int sqlite3BtreeInitPage(MemPage *pPage){
1.950 +
1.951 + assert( pPage->pBt!=0 );
1.952 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.953 + assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1.954 + assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1.955 + assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1.956 +
1.957 + if( !pPage->isInit ){
1.958 + int pc; /* Address of a freeblock within pPage->aData[] */
1.959 + int hdr; /* Offset to beginning of page header */
1.960 + u8 *data; /* Equal to pPage->aData */
1.961 + BtShared *pBt; /* The main btree structure */
1.962 + int usableSize; /* Amount of usable space on each page */
1.963 + int cellOffset; /* Offset from start of page to first cell pointer */
1.964 + int nFree; /* Number of unused bytes on the page */
1.965 + int top; /* First byte of the cell content area */
1.966 +
1.967 + pBt = pPage->pBt;
1.968 +
1.969 + hdr = pPage->hdrOffset;
1.970 + data = pPage->aData;
1.971 + if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT; /* any non-zero result is reported as corruption */
1.972 + assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1.973 + pPage->maskPage = pBt->pageSize - 1;
1.974 + pPage->nOverflow = 0;
1.975 + usableSize = pBt->usableSize;
1.976 + pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf; /* 12-byte header, or 8 bytes when leaf==1 */
1.977 + top = get2byte(&data[hdr+5]);
1.978 + pPage->nCell = get2byte(&data[hdr+3]);
1.979 + if( pPage->nCell>MX_CELL(pBt) ){
1.980 + /* Too many cells for a single page. The page must be corrupt */
1.981 + return SQLITE_CORRUPT_BKPT;
1.982 + }
1.983 +
1.984 + /* Compute the total free space on the page */
1.985 + pc = get2byte(&data[hdr+1]);
1.986 + nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell); /* byte hdr+7 plus the gap between the 2-byte cell pointers and the content area */
1.987 + while( pc>0 ){
1.988 + int next, size;
1.989 + if( pc>usableSize-4 ){
1.990 + /* Free block is off the page */
1.991 + return SQLITE_CORRUPT_BKPT;
1.992 + }
1.993 + next = get2byte(&data[pc]);
1.994 + size = get2byte(&data[pc+2]);
1.995 + if( next>0 && next<=pc+size+3 ){
1.996 + /* Free blocks must be in ascending order */
1.997 + return SQLITE_CORRUPT_BKPT;
1.998 + }
1.999 + nFree += size;
1.1000 + pc = next;
1.1001 + }
1.1002 + pPage->nFree = nFree; /* stored before the range check below, so it is written even when corruption is reported */
1.1003 + if( nFree>=usableSize ){
1.1004 + /* Free space cannot exceed total page size */
1.1005 + return SQLITE_CORRUPT_BKPT;
1.1006 + }
1.1007 +
1.1008 +#if 0
1.1009 + /* Check that all the offsets in the cell offset array are within range.
1.1010 + **
1.1011 + ** Omitting this consistency check and using the pPage->maskPage mask
1.1012 + ** to prevent overrunning the page buffer in findCell() results in a
1.1013 + ** 2.5% performance gain.
1.1014 + */
1.1015 + {
1.1016 + u8 *pOff; /* Iterator used to check all cell offsets are in range */
1.1017 + u8 *pEnd; /* Pointer to end of cell offset array */
1.1018 + u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1.1019 + mask = ~(((u8)(pBt->pageSize>>8))-1);
1.1020 + pEnd = &data[cellOffset + pPage->nCell*2];
1.1021 + for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1.1022 + if( pOff!=pEnd ){
1.1023 + return SQLITE_CORRUPT_BKPT;
1.1024 + }
1.1025 + }
1.1026 +#endif
1.1027 +
1.1028 + pPage->isInit = 1; /* only reached when none of the checks above failed */
1.1029 + }
1.1030 + return SQLITE_OK;
1.1031 +}
1.1032 +
1.1033 +/*
1.1034 +** Set up a raw page so that it looks like a database page holding
1.1035 +** no entries. The flags argument is stored as the page's flag byte.
1.1036 +*/
1.1037 +static void zeroPage(MemPage *pPage, int flags){
1.1038 + unsigned char *data = pPage->aData;
1.1039 + BtShared *pBt = pPage->pBt;
1.1040 + int hdr = pPage->hdrOffset;
1.1041 + int first; /* Offset of the first byte after the page header */
1.1042 +
1.1043 + assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1.1044 + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1.1045 + assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1.1046 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.1047 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1048 + /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1.1049 + data[hdr] = flags;
1.1050 + first = hdr + 8 + 4*((flags&PTF_LEAF)==0); /* 8-byte header when PTF_LEAF is set, 12 bytes otherwise */
1.1051 + memset(&data[hdr+1], 0, 4); /* zero the 2-byte fields at hdr+1 and hdr+3 (read as first freeblock and cell count by sqlite3BtreeInitPage) */
1.1052 + data[hdr+7] = 0; /* byte hdr+7 is counted into nFree by sqlite3BtreeInitPage(); start it at zero */
1.1053 + put2byte(&data[hdr+5], pBt->usableSize); /* content area starts at the end of the usable space, i.e. it is empty */
1.1054 + pPage->nFree = pBt->usableSize - first;
1.1055 + decodeFlags(pPage, flags); /* NOTE(review): return code ignored; callers are presumably expected to pass a valid flag combination */
1.1056 + pPage->hdrOffset = hdr;
1.1057 + pPage->cellOffset = first;
1.1058 + pPage->nOverflow = 0;
1.1059 + assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1.1060 + pPage->maskPage = pBt->pageSize - 1;
1.1061 + pPage->nCell = 0;
1.1062 + pPage->isInit = 1;
1.1063 +}
1.1064 +
1.1065 +
1.1066 +/*
1.1067 +** Convert a DbPage obtained from the pager into a MemPage used by
1.1068 +** the btree layer. The MemPage lives in the pager's "extra" area for the page and is returned.
1.1069 +*/
1.1070 +static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1.1071 + MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1.1072 + pPage->aData = sqlite3PagerGetData(pDbPage);
1.1073 + pPage->pDbPage = pDbPage;
1.1074 + pPage->pBt = pBt;
1.1075 + pPage->pgno = pgno;
1.1076 + pPage->hdrOffset = pPage->pgno==1 ? 100 : 0; /* page 1 keeps its page header at offset 100 (presumably after the 100-byte file header) */
1.1077 + return pPage; /* isInit and the decoded fields are not touched here */
1.1078 +}
1.1079 +
1.1080 +/*
1.1081 +** Get a page from the pager. Initialize the MemPage.pBt and
1.1082 +** MemPage.aData elements if needed.
1.1083 +**
1.1084 +** If the noContent flag is set, it means that we do not care about
1.1085 +** the content of the page at this time. So do not go to the disk
1.1086 +** to fetch the content. Just fill in the content with zeros for now.
1.1087 +** If in the future we call sqlite3PagerWrite() on this page, that
1.1088 +** means we have started to be concerned about content and the disk
1.1089 +** read should occur at that point. Returns the pager's error code on failure, SQLITE_OK otherwise.
1.1090 +*/
1.1091 +int sqlite3BtreeGetPage(
1.1092 + BtShared *pBt, /* The btree */
1.1093 + Pgno pgno, /* Number of the page to fetch */
1.1094 + MemPage **ppPage, /* Return the page in this parameter */
1.1095 + int noContent /* Do not load page content if true */
1.1096 +){
1.1097 + int rc;
1.1098 + DbPage *pDbPage;
1.1099 +
1.1100 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1101 + rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1.1102 + if( rc ) return rc; /* *ppPage is left unwritten on failure */
1.1103 + *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1.1104 + return SQLITE_OK;
1.1105 +}
1.1106 +
1.1107 +/*
1.1108 +** Return the size of the database file in pages. Or return -1 if
1.1109 +** there is any kind of error (the underlying error code is discarded).
1.1110 +*/
1.1111 +static int pagerPagecount(Pager *pPager){
1.1112 + int rc;
1.1113 + int nPage; /* Only read when sqlite3PagerPagecount() reports SQLITE_OK */
1.1114 + rc = sqlite3PagerPagecount(pPager, &nPage);
1.1115 + return (rc==SQLITE_OK?nPage:-1);
1.1116 +}
1.1117 +
1.1118 +/*
1.1119 +** Get a page from the pager and initialize it. This routine
1.1120 +** is just a convenience wrapper around separate calls to
1.1121 +** sqlite3BtreeGetPage() and sqlite3BtreeInitPage(). Page number 0 is rejected as corrupt.
1.1122 +*/
1.1123 +static int getAndInitPage(
1.1124 + BtShared *pBt, /* The database file */
1.1125 + Pgno pgno, /* Number of the page to get */
1.1126 + MemPage **ppPage /* Write the page pointer here */
1.1127 +){
1.1128 + int rc;
1.1129 + DbPage *pDbPage;
1.1130 + MemPage *pPage;
1.1131 +
1.1132 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1133 + if( pgno==0 ){
1.1134 + return SQLITE_CORRUPT_BKPT;
1.1135 + }
1.1136 +
1.1137 + /* It is often the case that the page we want is already in cache.
1.1138 + ** If so, get it directly. This saves us from having to call
1.1139 + ** pagerPagecount() to make sure pgno is within limits, which results
1.1140 + ** in a measurable performance improvement.
1.1141 + */
1.1142 + pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1.1143 + if( pDbPage ){
1.1144 + /* Page is already in cache */
1.1145 + *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1.1146 + rc = SQLITE_OK;
1.1147 + }else{
1.1148 + /* Page not in cache. Acquire it. */
1.1149 + if( pgno>pagerPagecount(pBt->pPager) ){ /* NOTE(review): if Pgno is unsigned, the -1 error value converts to a huge number and this check passes -- TODO confirm */
1.1150 + return SQLITE_CORRUPT_BKPT;
1.1151 + }
1.1152 + rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1.1153 + if( rc ) return rc;
1.1154 + pPage = *ppPage;
1.1155 + }
1.1156 + if( !pPage->isInit ){
1.1157 + rc = sqlite3BtreeInitPage(pPage);
1.1158 + }
1.1159 + if( rc!=SQLITE_OK ){
1.1160 + releasePage(pPage); /* drop the reference obtained above before reporting the failure */
1.1161 + *ppPage = 0;
1.1162 + }
1.1163 + return rc;
1.1164 +}
1.1165 +
1.1166 +/*
1.1167 +** Release a MemPage. This should be called once for each prior
1.1168 +** call to sqlite3BtreeGetPage. A NULL pPage is accepted and ignored.
1.1169 +*/
1.1170 +static void releasePage(MemPage *pPage){
1.1171 + if( pPage ){
1.1172 + assert( pPage->aData );
1.1173 + assert( pPage->pBt );
1.1174 + assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1.1175 + assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1.1176 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.1177 + sqlite3PagerUnref(pPage->pDbPage); /* the only non-assert action: drop the pager reference */
1.1178 + }
1.1179 +}
1.1180 +
1.1181 +/*
1.1182 +** During a rollback, when the pager reloads information into the cache
1.1183 +** so that the cache is restored to its original state at the start of
1.1184 +** the transaction, for each page restored this routine is called.
1.1185 +**
1.1186 +** This routine needs to reset the extra data section at the end of the
1.1187 +** page to agree with the restored data. Pages whose isInit flag is clear are left alone.
1.1188 +*/
1.1189 +static void pageReinit(DbPage *pData){
1.1190 + MemPage *pPage;
1.1191 + pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1.1192 + if( pPage->isInit ){
1.1193 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.1194 + pPage->isInit = 0; /* force sqlite3BtreeInitPage() to recompute instead of returning early */
1.1195 + if( sqlite3PagerPageRefcount(pData)>0 ){ /* unreferenced pages simply stay uninitialized */
1.1196 + sqlite3BtreeInitPage(pPage); /* NOTE(review): return code is discarded; on failure isInit stays 0 */
1.1197 + }
1.1198 + }
1.1199 +}
1.1200 +
1.1201 +/*
1.1202 +** Invoke the busy handler for a btree. pArg is the BtShared; the n argument is not used.
1.1203 +*/
1.1204 +static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1.1205 + BtShared *pBt = (BtShared*)pArg;
1.1206 + assert( pBt->db );
1.1207 + assert( sqlite3_mutex_held(pBt->db->mutex) );
1.1208 + return sqlite3InvokeBusyHandler(&pBt->db->busyHandler); /* forwards to the busy handler of the connection currently recorded in pBt->db */
1.1209 +}
1.1210 +
1.1211 +/*
1.1212 +** Open a database file.
1.1213 +**
1.1214 +** zFilename is the name of the database file. If zFilename is NULL
1.1215 +** a new database with a random name is created. This randomly named
1.1216 +** database file will be deleted when sqlite3BtreeClose() is called.
1.1217 +** If zFilename is ":memory:" then an in-memory database is created
1.1218 +** that is automatically destroyed when it is closed. Returns SQLITE_OK or an error code.
1.1219 +*/
1.1220 +int sqlite3BtreeOpen(
1.1221 + const char *zFilename, /* Name of the file containing the BTree database */
1.1222 + sqlite3 *db, /* Associated database handle */
1.1223 + Btree **ppBtree, /* Pointer to new Btree object written here */
1.1224 + int flags, /* Options */
1.1225 + int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1.1226 +){
1.1227 + sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1.1228 + BtShared *pBt = 0; /* Shared part of btree structure */
1.1229 + Btree *p; /* Handle to return */
1.1230 + int rc = SQLITE_OK;
1.1231 + int nReserve; /* Reserved bytes per page (pageSize - usableSize) */
1.1232 + unsigned char zDbHeader[100]; /* First 100 bytes of the database file */
1.1233 +
1.1234 + /* Set the variable isMemdb to true for an in-memory database, or
1.1235 + ** false for a file-based database. This symbol is only required if
1.1236 + ** either of the shared-data or autovacuum features are compiled
1.1237 + ** into the library.
1.1238 + */
1.1239 +#if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1.1240 + #ifdef SQLITE_OMIT_MEMORYDB
1.1241 + const int isMemdb = 0;
1.1242 + #else
1.1243 + const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1.1244 + #endif
1.1245 +#endif
1.1246 +
1.1247 + assert( db!=0 );
1.1248 + assert( sqlite3_mutex_held(db->mutex) );
1.1249 +
1.1250 + pVfs = db->pVfs;
1.1251 + p = sqlite3MallocZero(sizeof(Btree));
1.1252 + if( !p ){
1.1253 + return SQLITE_NOMEM; /* *ppBtree is not written on this path */
1.1254 + }
1.1255 + p->inTrans = TRANS_NONE;
1.1256 + p->db = db;
1.1257 +
1.1258 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1259 + /*
1.1260 + ** If this Btree is a candidate for shared cache, try to find an
1.1261 + ** existing BtShared object that we can share with
1.1262 + */
1.1263 + if( isMemdb==0
1.1264 + && (db->flags & SQLITE_Vtab)==0
1.1265 + && zFilename && zFilename[0]
1.1266 + ){
1.1267 + if( sqlite3GlobalConfig.sharedCacheEnabled ){
1.1268 + int nFullPathname = pVfs->mxPathname+1;
1.1269 + char *zFullPathname = sqlite3Malloc(nFullPathname);
1.1270 + sqlite3_mutex *mutexShared;
1.1271 + p->sharable = 1;
1.1272 + db->flags |= SQLITE_SharedCache; /* set before the allocation check, so it stays set even if that check fails */
1.1273 + if( !zFullPathname ){
1.1274 + sqlite3_free(p);
1.1275 + return SQLITE_NOMEM; /* *ppBtree is not written on this path */
1.1276 + }
1.1277 + sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname); /* NOTE(review): return value is not checked */
1.1278 + mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1.1279 + sqlite3_mutex_enter(mutexShared);
1.1280 + for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1.1281 + assert( pBt->nRef>0 );
1.1282 + if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1.1283 + && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1.1284 + p->pBt = pBt; /* same full pathname and same VFS: reuse this BtShared */
1.1285 + pBt->nRef++;
1.1286 + break;
1.1287 + }
1.1288 + }
1.1289 + sqlite3_mutex_leave(mutexShared);
1.1290 + sqlite3_free(zFullPathname);
1.1291 + }
1.1292 +#ifdef SQLITE_DEBUG
1.1293 + else{
1.1294 + /* In debug mode, we mark all persistent databases as sharable
1.1295 + ** even when they are not. This exercises the locking code and
1.1296 + ** gives more opportunity for asserts(sqlite3_mutex_held())
1.1297 + ** statements to find locking problems.
1.1298 + */
1.1299 + p->sharable = 1;
1.1300 + }
1.1301 +#endif
1.1302 + }
1.1303 +#endif
1.1304 + if( pBt==0 ){ /* no existing BtShared was found (the loop above leaves pBt at 0), so build a new one */
1.1305 + /*
1.1306 + ** The following asserts make sure that structures used by the btree are
1.1307 + ** the right size. This is to guard against size changes that result
1.1308 + ** when compiling on a different architecture.
1.1309 + */
1.1310 + assert( sizeof(i64)==8 || sizeof(i64)==4 );
1.1311 + assert( sizeof(u64)==8 || sizeof(u64)==4 );
1.1312 + assert( sizeof(u32)==4 );
1.1313 + assert( sizeof(u16)==2 );
1.1314 + assert( sizeof(Pgno)==4 );
1.1315 +
1.1316 + pBt = sqlite3MallocZero( sizeof(*pBt) );
1.1317 + if( pBt==0 ){
1.1318 + rc = SQLITE_NOMEM;
1.1319 + goto btree_open_out;
1.1320 + }
1.1321 + pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1.1322 + pBt->busyHdr.pArg = pBt;
1.1323 + rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1.1324 + EXTRA_SIZE, flags, vfsFlags);
1.1325 + if( rc==SQLITE_OK ){
1.1326 + rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1.1327 + }
1.1328 + if( rc!=SQLITE_OK ){
1.1329 + goto btree_open_out;
1.1330 + }
1.1331 + sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1.1332 + p->pBt = pBt;
1.1333 +
1.1334 + sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1.1335 + pBt->pCursor = 0;
1.1336 + pBt->pPage1 = 0;
1.1337 + pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1.1338 + pBt->pageSize = get2byte(&zDbHeader[16]); /* page size is the 2-byte value at header offset 16 */
1.1339 + if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1.1340 + || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1.1341 + pBt->pageSize = 0; /* header value unusable; presumably lets the pager report its own page size -- TODO confirm */
1.1342 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1343 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1344 + /* If the magic name ":memory:" will create an in-memory database, then
1.1345 + ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1.1346 + ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1.1347 + ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1.1348 + ** regular file-name. In this case the auto-vacuum applies as per normal.
1.1349 + */
1.1350 + if( zFilename && !isMemdb ){
1.1351 + pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1.1352 + pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1.1353 + }
1.1354 +#endif
1.1355 + nReserve = 0;
1.1356 + }else{
1.1357 + nReserve = zDbHeader[20]; /* reserved bytes per page come from header byte 20 */
1.1358 + pBt->pageSizeFixed = 1; /* a valid header pins the page size; see sqlite3BtreeSetPageSize() */
1.1359 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1360 + pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1.1361 + pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1.1362 +#endif
1.1363 + }
1.1364 + pBt->usableSize = pBt->pageSize - nReserve;
1.1365 + assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1.1366 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1367 +
1.1368 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1369 + /* Add the new BtShared object to the linked list of sharable BtShareds.
1.1370 + */
1.1371 + if( p->sharable ){
1.1372 + sqlite3_mutex *mutexShared;
1.1373 + pBt->nRef = 1;
1.1374 + mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1.1375 + if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1.1376 + pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1.1377 + if( pBt->mutex==0 ){
1.1378 + rc = SQLITE_NOMEM;
1.1379 + db->mallocFailed = 0; /* NOTE(review): clears the connection's mallocFailed flag; rationale not visible here */
1.1380 + goto btree_open_out;
1.1381 + }
1.1382 + }
1.1383 + sqlite3_mutex_enter(mutexShared);
1.1384 + pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList); /* push onto the head of the global list */
1.1385 + GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1.1386 + sqlite3_mutex_leave(mutexShared);
1.1387 + }
1.1388 +#endif
1.1389 + }
1.1390 +
1.1391 +#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1.1392 + /* If the new Btree uses a sharable pBtShared, then link the new
1.1393 + ** Btree into the list of all sharable Btrees for the same connection.
1.1394 + ** The list is kept in ascending order by pBt address.
1.1395 + */
1.1396 + if( p->sharable ){
1.1397 + int i;
1.1398 + Btree *pSib;
1.1399 + for(i=0; i<db->nDb; i++){
1.1400 + if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1.1401 + while( pSib->pPrev ){ pSib = pSib->pPrev; } /* rewind to the head of the sibling list */
1.1402 + if( p->pBt<pSib->pBt ){
1.1403 + p->pNext = pSib; /* p becomes the new head */
1.1404 + p->pPrev = 0;
1.1405 + pSib->pPrev = p;
1.1406 + }else{
1.1407 + while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1.1408 + pSib = pSib->pNext;
1.1409 + }
1.1410 + p->pNext = pSib->pNext; /* splice p in directly after pSib */
1.1411 + p->pPrev = pSib;
1.1412 + if( p->pNext ){
1.1413 + p->pNext->pPrev = p;
1.1414 + }
1.1415 + pSib->pNext = p;
1.1416 + }
1.1417 + break; /* one sharable sibling is enough to locate the list */
1.1418 + }
1.1419 + }
1.1420 + }
1.1421 +#endif
1.1422 + *ppBtree = p; /* success: hand the new handle to the caller */
1.1423 +
1.1424 +btree_open_out:
1.1425 + if( rc!=SQLITE_OK ){
1.1426 + if( pBt && pBt->pPager ){
1.1427 + sqlite3PagerClose(pBt->pPager);
1.1428 + }
1.1429 + sqlite3_free(pBt);
1.1430 + sqlite3_free(p);
1.1431 + *ppBtree = 0; /* failures routed through this label always clear the output handle */
1.1432 + }
1.1433 + return rc;
1.1434 +}
1.1435 +
1.1436 +/*
1.1437 +** Decrement the BtShared.nRef counter. When it reaches zero,
1.1438 +** remove the BtShared structure from the sharing list. Return
1.1439 +** true if the BtShared.nRef counter reaches zero and return
1.1440 +** false if it is still positive. With SQLITE_OMIT_SHARED_CACHE this always returns true.
1.1441 +*/
1.1442 +static int removeFromSharingList(BtShared *pBt){
1.1443 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.1444 + sqlite3_mutex *pMaster;
1.1445 + BtShared *pList;
1.1446 + int removed = 0;
1.1447 +
1.1448 + assert( sqlite3_mutex_notheld(pBt->mutex) );
1.1449 + pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1.1450 + sqlite3_mutex_enter(pMaster); /* the counter and the global list are only touched under the static master mutex */
1.1451 + pBt->nRef--;
1.1452 + if( pBt->nRef<=0 ){
1.1453 + if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1.1454 + GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext; /* pBt was the list head */
1.1455 + }else{
1.1456 + pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1.1457 + while( ALWAYS(pList) && pList->pNext!=pBt ){ /* find the predecessor of pBt */
1.1458 + pList=pList->pNext;
1.1459 + }
1.1460 + if( ALWAYS(pList) ){
1.1461 + pList->pNext = pBt->pNext;
1.1462 + }
1.1463 + }
1.1464 + if( SQLITE_THREADSAFE ){
1.1465 + sqlite3_mutex_free(pBt->mutex);
1.1466 + }
1.1467 + removed = 1;
1.1468 + }
1.1469 + sqlite3_mutex_leave(pMaster);
1.1470 + return removed;
1.1471 +#else
1.1472 + return 1;
1.1473 +#endif
1.1474 +}
1.1475 +
1.1476 +/*
1.1477 +** Make sure pBt->pTmpSpace points to an allocation. pBt->pageSize bytes are requested,
1.1478 +** which is presumably at least MX_CELL_SIZE(pBt) bytes -- TODO confirm.
1.1479 +*/
1.1480 +static void allocateTempSpace(BtShared *pBt){
1.1481 + if( !pBt->pTmpSpace ){
1.1482 + pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize ); /* result is not checked here; callers must cope with pTmpSpace still being 0 */
1.1483 + }
1.1484 +}
1.1485 +
1.1486 +/*
1.1487 +** Free the pBt->pTmpSpace allocation and reset the pointer to 0.
1.1488 +*/
1.1489 +static void freeTempSpace(BtShared *pBt){
1.1490 + sqlite3PageFree( pBt->pTmpSpace);
1.1491 + pBt->pTmpSpace = 0; /* lets allocateTempSpace() allocate again later */
1.1492 +}
1.1493 +
1.1494 +/*
1.1495 +** Close an open database and invalidate all cursors. Frees p and always returns SQLITE_OK.
1.1496 +*/
1.1497 +int sqlite3BtreeClose(Btree *p){
1.1498 + BtShared *pBt = p->pBt;
1.1499 + BtCursor *pCur;
1.1500 +
1.1501 + /* Close all cursors opened via this handle. */
1.1502 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1503 + sqlite3BtreeEnter(p);
1.1504 + pBt->db = p->db;
1.1505 + pCur = pBt->pCursor;
1.1506 + while( pCur ){
1.1507 + BtCursor *pTmp = pCur;
1.1508 + pCur = pCur->pNext; /* advance first: pTmp may be closed below */
1.1509 + if( pTmp->pBtree==p ){
1.1510 + sqlite3BtreeCloseCursor(pTmp);
1.1511 + }
1.1512 + }
1.1513 +
1.1514 + /* Rollback any active transaction and free the handle structure.
1.1515 + ** The call to sqlite3BtreeRollback() drops any table-locks held by
1.1516 + ** this handle.
1.1517 + */
1.1518 + sqlite3BtreeRollback(p); /* return value is not examined */
1.1519 + sqlite3BtreeLeave(p);
1.1520 +
1.1521 + /* If there are still other outstanding references to the shared-btree
1.1522 + ** structure, return now. The remainder of this procedure cleans
1.1523 + ** up the shared-btree.
1.1524 + */
1.1525 + assert( p->wantToLock==0 && p->locked==0 );
1.1526 + if( !p->sharable || removeFromSharingList(pBt) ){
1.1527 + /* The pBt is no longer on the sharing list, so we can access
1.1528 + ** it without having to hold the mutex.
1.1529 + **
1.1530 + ** Clean out and delete the BtShared object.
1.1531 + */
1.1532 + assert( !pBt->pCursor );
1.1533 + sqlite3PagerClose(pBt->pPager);
1.1534 + if( pBt->xFreeSchema && pBt->pSchema ){
1.1535 + pBt->xFreeSchema(pBt->pSchema); /* schema-specific cleanup hook; the memory itself is freed on the next statement */
1.1536 + }
1.1537 + sqlite3_free(pBt->pSchema);
1.1538 + freeTempSpace(pBt);
1.1539 + sqlite3_free(pBt);
1.1540 + }
1.1541 +
1.1542 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.1543 + assert( p->wantToLock==0 );
1.1544 + assert( p->locked==0 );
1.1545 + if( p->pPrev ) p->pPrev->pNext = p->pNext; /* unlink p from the doubly-linked list of sibling Btrees */
1.1546 + if( p->pNext ) p->pNext->pPrev = p->pPrev;
1.1547 +#endif
1.1548 +
1.1549 + sqlite3_free(p);
1.1550 + return SQLITE_OK;
1.1551 +}
1.1552 +
1.1553 +/*
1.1554 +** Change the limit on the number of pages allowed in the cache.
1.1555 +**
1.1556 +** The maximum number of cache pages is set to the absolute
1.1557 +** value of mxPage. If mxPage is negative, the pager will
1.1558 +** operate asynchronously - it will not stop to do fsync()s
1.1559 +** to ensure data is written to the disk surface before
1.1560 +** continuing. Transactions still work if synchronous is off,
1.1561 +** and the database cannot be corrupted if this program
1.1562 +** crashes. But if the operating system crashes or there is
1.1563 +** an abrupt power failure when synchronous is off, the database
1.1564 +** could be left in an inconsistent and unrecoverable state.
1.1565 +** Synchronous is on by default so database corruption is not
1.1566 +** normally a worry. (NOTE(review): the handling of mxPage lives in the pager; this wrapper only forwards it.)
1.1567 +*/
1.1568 +int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1.1569 + BtShared *pBt = p->pBt;
1.1570 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1571 + sqlite3BtreeEnter(p);
1.1572 + sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1.1573 + sqlite3BtreeLeave(p);
1.1574 + return SQLITE_OK; /* always succeeds from the caller's point of view */
1.1575 +}
1.1576 +
1.1577 +/*
1.1578 +** Change the way data is synced to disk in order to increase or decrease
1.1579 +** how well the database resists damage due to OS crashes and power
1.1580 +** failures. Level 1 is the same as asynchronous (no syncs() occur and
1.1581 +** there is a high probability of damage). Level 2 is the default. There
1.1582 +** is a very low but non-zero probability of damage. Level 3 reduces the
1.1583 +** probability of damage to near zero but with a write performance reduction.
1.1584 +*/
1.1585 +#ifndef SQLITE_OMIT_PAGER_PRAGMAS
1.1586 +int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1.1587 + BtShared *pBt = p->pBt;
1.1588 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1589 + sqlite3BtreeEnter(p);
1.1590 + sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync); /* level and fullSync are forwarded to the pager unchanged */
1.1591 + sqlite3BtreeLeave(p);
1.1592 + return SQLITE_OK; /* always succeeds from the caller's point of view */
1.1593 +}
1.1594 +#endif
1.1595 +
1.1596 +/*
1.1597 +** Return TRUE if the given btree is set to safety level 1. In other
1.1598 +** words, return TRUE if no sync() occurs on the disk files. The value comes from sqlite3PagerNosync().
1.1599 +*/
1.1600 +int sqlite3BtreeSyncDisabled(Btree *p){
1.1601 + BtShared *pBt = p->pBt;
1.1602 + int rc; /* Holds the boolean answer, not an SQLite error code */
1.1603 + assert( sqlite3_mutex_held(p->db->mutex) );
1.1604 + sqlite3BtreeEnter(p);
1.1605 + assert( pBt && pBt->pPager );
1.1606 + rc = sqlite3PagerNosync(pBt->pPager);
1.1607 + sqlite3BtreeLeave(p);
1.1608 + return rc;
1.1609 +}
1.1610 +
1.1611 +#if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1.1612 +/*
1.1613 +** Change the default page size and the number of reserved bytes per page.
1.1614 +**
1.1615 +** The page size must be a power of 2 between 512 and SQLITE_MAX_PAGE_SIZE. If the page
1.1616 +** size supplied does not meet this constraint then the page size is not
1.1617 +** changed (usableSize is still recomputed from nReserve in that case).
1.1618 +**
1.1619 +** Page sizes are constrained to be a power of two so that the region
1.1620 +** of the database file used for locking (beginning at PENDING_BYTE,
1.1621 +** the first byte past the 1GB boundary, 0x40000000) always starts
1.1622 +** at the beginning of a page.
1.1623 +**
1.1624 +** If parameter nReserve is less than zero, then the number of reserved
1.1625 +** bytes per page is left unchanged. Returns SQLITE_READONLY if the page size is already fixed.
1.1626 +*/
1.1627 +int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1.1628 + int rc = SQLITE_OK;
1.1629 + BtShared *pBt = p->pBt;
1.1630 + sqlite3BtreeEnter(p);
1.1631 + if( pBt->pageSizeFixed ){
1.1632 + sqlite3BtreeLeave(p);
1.1633 + return SQLITE_READONLY;
1.1634 + }
1.1635 + if( nReserve<0 ){
1.1636 + nReserve = pBt->pageSize - pBt->usableSize; /* keep the current reserve */
1.1637 + }
1.1638 + if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1.1639 + ((pageSize-1)&pageSize)==0 ){
1.1640 + assert( (pageSize & 7)==0 );
1.1641 + assert( !pBt->pPage1 && !pBt->pCursor );
1.1642 + pBt->pageSize = pageSize;
1.1643 + freeTempSpace(pBt); /* the temp buffer is sized from pageSize, so discard it */
1.1644 + rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1645 + }
1.1646 + pBt->usableSize = pBt->pageSize - nReserve; /* runs even when pageSize was rejected above */
1.1647 + sqlite3BtreeLeave(p);
1.1648 + return rc;
1.1649 +}
1.1650 +
1.1651 +/*
1.1652 +** Return the currently defined page size. The value is read without entering the btree mutex.
1.1653 +*/
1.1654 +int sqlite3BtreeGetPageSize(Btree *p){
1.1655 + return p->pBt->pageSize;
1.1656 +}
1.1657 +int sqlite3BtreeGetReserve(Btree *p){ /* Return the number of reserved bytes per page (pageSize - usableSize) */
1.1658 + int n;
1.1659 + sqlite3BtreeEnter(p); /* both fields are read while holding the btree mutex */
1.1660 + n = p->pBt->pageSize - p->pBt->usableSize;
1.1661 + sqlite3BtreeLeave(p);
1.1662 + return n;
1.1663 +}
1.1664 +
1.1665 +/*
1.1666 +** Set the maximum page count for a database if mxPage is positive.
1.1667 +** No changes are made if mxPage is 0 or negative.
1.1668 +** Regardless of the value of mxPage, return the maximum page count (as reported by sqlite3PagerMaxPageCount()).
1.1669 +*/
1.1670 +int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1.1671 + int n;
1.1672 + sqlite3BtreeEnter(p);
1.1673 + n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage); /* mxPage is forwarded unchanged; the pager applies the rule described above */
1.1674 + sqlite3BtreeLeave(p);
1.1675 + return n;
1.1676 +}
1.1677 +#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1.1678 +
1.1679 +/*
1.1680 +** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1.1681 +** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1.1682 +** is disabled. The default value for the auto-vacuum property is
1.1683 +** determined by the SQLITE_DEFAULT_AUTOVACUUM macro. Returns SQLITE_READONLY if the change is refused.
1.1684 +*/
1.1685 +int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1.1686 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.1687 + return SQLITE_READONLY;
1.1688 +#else
1.1689 + BtShared *pBt = p->pBt;
1.1690 + int rc = SQLITE_OK;
1.1691 + int av = (autoVacuum?1:0); /* normalize to 0/1 so it can be compared with pBt->autoVacuum */
1.1692 +
1.1693 + sqlite3BtreeEnter(p);
1.1694 + if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1.1695 + rc = SQLITE_READONLY; /* once the page size is fixed, the mode may no longer be changed */
1.1696 + }else{
1.1697 + pBt->autoVacuum = av;
1.1698 + }
1.1699 + sqlite3BtreeLeave(p);
1.1700 + return rc;
1.1701 +#endif
1.1702 +}
1.1703 +
1.1704 +/*
1.1705 +** Return the value of the 'auto-vacuum' property as one of BTREE_AUTOVACUUM_NONE,
1.1706 +** BTREE_AUTOVACUUM_FULL or BTREE_AUTOVACUUM_INCR (always NONE with SQLITE_OMIT_AUTOVACUUM).
1.1707 +*/
1.1708 +int sqlite3BtreeGetAutoVacuum(Btree *p){
1.1709 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.1710 + return BTREE_AUTOVACUUM_NONE;
1.1711 +#else
1.1712 + int rc; /* Holds the BTREE_AUTOVACUUM_* answer, not an error code */
1.1713 + sqlite3BtreeEnter(p);
1.1714 + rc = (
1.1715 + (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1.1716 + (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1.1717 + BTREE_AUTOVACUUM_INCR
1.1718 + );
1.1719 + sqlite3BtreeLeave(p);
1.1720 + return rc;
1.1721 +#endif
1.1722 +}
1.1723 +
1.1724 +
1.1725 +/*
1.1726 +** Get a reference to pPage1 of the database file. This will
1.1727 +** also acquire a readlock on that file.
1.1728 +**
1.1729 +** SQLITE_OK is returned on success. If the file is not a
1.1730 +** well-formed database file, then SQLITE_CORRUPT is returned.
1.1731 +** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1.1732 +** is returned if we run out of memory.
1.1733 +*/
1.1734 +static int lockBtree(BtShared *pBt){
1.1735 + int rc;
1.1736 + MemPage *pPage1;
1.1737 + int nPage;
1.1738 +
1.1739 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1740 + if( pBt->pPage1 ) return SQLITE_OK;
1.1741 + rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1.1742 + if( rc!=SQLITE_OK ) return rc;
1.1743 +
1.1744 + /* Do some checking to help insure the file we opened really is
1.1745 + ** a valid database file.
1.1746 + */
1.1747 + rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1.1748 + if( rc!=SQLITE_OK ){
1.1749 + goto page1_init_failed;
1.1750 + }else if( nPage>0 ){
1.1751 + int pageSize;
1.1752 + int usableSize;
1.1753 + u8 *page1 = pPage1->aData;
1.1754 + rc = SQLITE_NOTADB;
1.1755 + if( memcmp(page1, zMagicHeader, 16)!=0 ){
1.1756 + goto page1_init_failed;
1.1757 + }
1.1758 + if( page1[18]>1 ){
1.1759 + pBt->readOnly = 1;
1.1760 + }
1.1761 + if( page1[19]>1 ){
1.1762 + goto page1_init_failed;
1.1763 + }
1.1764 +
1.1765 + /* The maximum embedded fraction must be exactly 25%. And the minimum
1.1766 + ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1.1767 + ** The original design allowed these amounts to vary, but as of
1.1768 + ** version 3.6.0, we require them to be fixed.
1.1769 + */
1.1770 + if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1.1771 + goto page1_init_failed;
1.1772 + }
1.1773 + pageSize = get2byte(&page1[16]);
1.1774 + if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1.1775 + (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1.1776 + ){
1.1777 + goto page1_init_failed;
1.1778 + }
1.1779 + assert( (pageSize & 7)==0 );
1.1780 + usableSize = pageSize - page1[20];
1.1781 + if( pageSize!=pBt->pageSize ){
1.1782 + /* After reading the first page of the database assuming a page size
1.1783 + ** of BtShared.pageSize, we have discovered that the page-size is
1.1784 + ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1.1785 + ** zero and return SQLITE_OK. The caller will call this function
1.1786 + ** again with the correct page-size.
1.1787 + */
1.1788 + releasePage(pPage1);
1.1789 + pBt->usableSize = usableSize;
1.1790 + pBt->pageSize = pageSize;
1.1791 + freeTempSpace(pBt);
1.1792 + sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1.1793 + return SQLITE_OK;
1.1794 + }
1.1795 + if( usableSize<500 ){
1.1796 + goto page1_init_failed;
1.1797 + }
1.1798 + pBt->pageSize = pageSize;
1.1799 + pBt->usableSize = usableSize;
1.1800 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1801 + pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1.1802 + pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1.1803 +#endif
1.1804 + }
1.1805 +
1.1806 + /* maxLocal is the maximum amount of payload to store locally for
1.1807 + ** a cell. Make sure it is small enough so that at least minFanout
1.1808 + ** cells can will fit on one page. We assume a 10-byte page header.
1.1809 + ** Besides the payload, the cell must store:
1.1810 + ** 2-byte pointer to the cell
1.1811 + ** 4-byte child pointer
1.1812 + ** 9-byte nKey value
1.1813 + ** 4-byte nData value
1.1814 + ** 4-byte overflow page pointer
1.1815 + ** So a cell consists of a 2-byte poiner, a header which is as much as
1.1816 + ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1.1817 + ** page pointer.
1.1818 + */
1.1819 + pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1.1820 + pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
1.1821 + pBt->maxLeaf = pBt->usableSize - 35;
1.1822 + pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
1.1823 + assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1.1824 + pBt->pPage1 = pPage1;
1.1825 + return SQLITE_OK;
1.1826 +
1.1827 +page1_init_failed:
1.1828 + releasePage(pPage1);
1.1829 + pBt->pPage1 = 0;
1.1830 + return rc;
1.1831 +}
1.1832 +
1.1833 +/*
1.1834 +** This routine works like lockBtree() except that it also invokes the
1.1835 +** busy callback if there is lock contention.
1.1836 +*/
1.1837 +static int lockBtreeWithRetry(Btree *pRef){
1.1838 + int rc = SQLITE_OK;
1.1839 +
1.1840 + assert( sqlite3BtreeHoldsMutex(pRef) );
1.1841 + if( pRef->inTrans==TRANS_NONE ){
1.1842 + u8 inTransaction = pRef->pBt->inTransaction;
1.1843 + btreeIntegrity(pRef);
1.1844 + rc = sqlite3BtreeBeginTrans(pRef, 0);
1.1845 + pRef->pBt->inTransaction = inTransaction;
1.1846 + pRef->inTrans = TRANS_NONE;
1.1847 + if( rc==SQLITE_OK ){
1.1848 + pRef->pBt->nTransaction--;
1.1849 + }
1.1850 + btreeIntegrity(pRef);
1.1851 + }
1.1852 + return rc;
1.1853 +}
1.1854 +
1.1855 +
1.1856 +/*
1.1857 +** If there are no outstanding cursors and we are not in the middle
1.1858 +** of a transaction but there is a read lock on the database, then
1.1859 +** this routine unrefs the first page of the database file which
1.1860 +** has the effect of releasing the read lock.
1.1861 +**
1.1862 +** If there are any outstanding cursors, this routine is a no-op.
1.1863 +**
1.1864 +** If there is a transaction in progress, this routine is a no-op.
1.1865 +*/
1.1866 +static void unlockBtreeIfUnused(BtShared *pBt){
1.1867 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1868 + if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1.1869 + if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1.1870 + assert( pBt->pPage1->aData );
1.1871 +#if 0
1.1872 + if( pBt->pPage1->aData==0 ){
1.1873 + MemPage *pPage = pBt->pPage1;
1.1874 + pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1.1875 + pPage->pBt = pBt;
1.1876 + pPage->pgno = 1;
1.1877 + }
1.1878 +#endif
1.1879 + releasePage(pBt->pPage1);
1.1880 + }
1.1881 + pBt->pPage1 = 0;
1.1882 + pBt->inStmt = 0;
1.1883 + }
1.1884 +}
1.1885 +
1.1886 +/*
1.1887 +** Create a new database by initializing the first page of the
1.1888 +** file.
1.1889 +*/
1.1890 +static int newDatabase(BtShared *pBt){
1.1891 + MemPage *pP1;
1.1892 + unsigned char *data;
1.1893 + int rc;
1.1894 + int nPage;
1.1895 +
1.1896 + assert( sqlite3_mutex_held(pBt->mutex) );
1.1897 + rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1.1898 + if( rc!=SQLITE_OK || nPage>0 ){
1.1899 + return rc;
1.1900 + }
1.1901 + pP1 = pBt->pPage1;
1.1902 + assert( pP1!=0 );
1.1903 + data = pP1->aData;
1.1904 + rc = sqlite3PagerWrite(pP1->pDbPage);
1.1905 + if( rc ) return rc;
1.1906 + memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1.1907 + assert( sizeof(zMagicHeader)==16 );
1.1908 + put2byte(&data[16], pBt->pageSize);
1.1909 + data[18] = 1;
1.1910 + data[19] = 1;
1.1911 + data[20] = pBt->pageSize - pBt->usableSize;
1.1912 + data[21] = 64;
1.1913 + data[22] = 32;
1.1914 + data[23] = 32;
1.1915 + memset(&data[24], 0, 100-24);
1.1916 + zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1.1917 + pBt->pageSizeFixed = 1;
1.1918 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.1919 + assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1.1920 + assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1.1921 + put4byte(&data[36 + 4*4], pBt->autoVacuum);
1.1922 + put4byte(&data[36 + 7*4], pBt->incrVacuum);
1.1923 +#endif
1.1924 + return SQLITE_OK;
1.1925 +}
1.1926 +
1.1927 +/*
1.1928 +** Attempt to start a new transaction. A write-transaction
1.1929 +** is started if the second argument is nonzero, otherwise a read-
1.1930 +** transaction. If the second argument is 2 or more and exclusive
1.1931 +** transaction is started, meaning that no other process is allowed
1.1932 +** to access the database. A preexisting transaction may not be
1.1933 +** upgraded to exclusive by calling this routine a second time - the
1.1934 +** exclusivity flag only works for a new transaction.
1.1935 +**
1.1936 +** A write-transaction must be started before attempting any
1.1937 +** changes to the database. None of the following routines
1.1938 +** will work unless a transaction is started first:
1.1939 +**
1.1940 +** sqlite3BtreeCreateTable()
1.1941 +** sqlite3BtreeCreateIndex()
1.1942 +** sqlite3BtreeClearTable()
1.1943 +** sqlite3BtreeDropTable()
1.1944 +** sqlite3BtreeInsert()
1.1945 +** sqlite3BtreeDelete()
1.1946 +** sqlite3BtreeUpdateMeta()
1.1947 +**
1.1948 +** If an initial attempt to acquire the lock fails because of lock contention
1.1949 +** and the database was previously unlocked, then invoke the busy handler
1.1950 +** if there is one. But if there was previously a read-lock, do not
1.1951 +** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1.1952 +** returned when there is already a read-lock in order to avoid a deadlock.
1.1953 +**
1.1954 +** Suppose there are two processes A and B. A has a read lock and B has
1.1955 +** a reserved lock. B tries to promote to exclusive but is blocked because
1.1956 +** of A's read lock. A tries to promote to reserved but is blocked by B.
1.1957 +** One or the other of the two processes must give way or there can be
1.1958 +** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1.1959 +** when A already has a read lock, we encourage A to give up and let B
1.1960 +** proceed.
1.1961 +*/
1.1962 +int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1.1963 + BtShared *pBt = p->pBt;
1.1964 + int rc = SQLITE_OK;
1.1965 +
1.1966 + sqlite3BtreeEnter(p);
1.1967 + pBt->db = p->db;
1.1968 + btreeIntegrity(p);
1.1969 +
1.1970 + /* If the btree is already in a write-transaction, or it
1.1971 + ** is already in a read-transaction and a read-transaction
1.1972 + ** is requested, this is a no-op.
1.1973 + */
1.1974 + if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1.1975 + goto trans_begun;
1.1976 + }
1.1977 +
1.1978 + /* Write transactions are not possible on a read-only database */
1.1979 + if( pBt->readOnly && wrflag ){
1.1980 + rc = SQLITE_READONLY;
1.1981 + goto trans_begun;
1.1982 + }
1.1983 +
1.1984 + /* If another database handle has already opened a write transaction
1.1985 + ** on this shared-btree structure and a second write transaction is
1.1986 + ** requested, return SQLITE_BUSY.
1.1987 + */
1.1988 + if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1.1989 + rc = SQLITE_BUSY;
1.1990 + goto trans_begun;
1.1991 + }
1.1992 +
1.1993 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.1994 + if( wrflag>1 ){
1.1995 + BtLock *pIter;
1.1996 + for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1.1997 + if( pIter->pBtree!=p ){
1.1998 + rc = SQLITE_BUSY;
1.1999 + goto trans_begun;
1.2000 + }
1.2001 + }
1.2002 + }
1.2003 +#endif
1.2004 +
1.2005 + do {
1.2006 + if( pBt->pPage1==0 ){
1.2007 + do{
1.2008 + rc = lockBtree(pBt);
1.2009 + }while( pBt->pPage1==0 && rc==SQLITE_OK );
1.2010 + }
1.2011 +
1.2012 + if( rc==SQLITE_OK && wrflag ){
1.2013 + if( pBt->readOnly ){
1.2014 + rc = SQLITE_READONLY;
1.2015 + }else{
1.2016 + rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1.2017 + if( rc==SQLITE_OK ){
1.2018 + rc = newDatabase(pBt);
1.2019 + }
1.2020 + }
1.2021 + }
1.2022 +
1.2023 + if( rc==SQLITE_OK ){
1.2024 + if( wrflag ) pBt->inStmt = 0;
1.2025 + }else{
1.2026 + unlockBtreeIfUnused(pBt);
1.2027 + }
1.2028 + }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1.2029 + sqlite3BtreeInvokeBusyHandler(pBt, 0) );
1.2030 +
1.2031 + if( rc==SQLITE_OK ){
1.2032 + if( p->inTrans==TRANS_NONE ){
1.2033 + pBt->nTransaction++;
1.2034 + }
1.2035 + p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
1.2036 + if( p->inTrans>pBt->inTransaction ){
1.2037 + pBt->inTransaction = p->inTrans;
1.2038 + }
1.2039 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.2040 + if( wrflag>1 ){
1.2041 + assert( !pBt->pExclusive );
1.2042 + pBt->pExclusive = p;
1.2043 + }
1.2044 +#endif
1.2045 + }
1.2046 +
1.2047 +
1.2048 +trans_begun:
1.2049 + btreeIntegrity(p);
1.2050 + sqlite3BtreeLeave(p);
1.2051 + return rc;
1.2052 +}
1.2053 +
1.2054 +
1.2055 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2056 +
1.2057 +/*
1.2058 +** Set the pointer-map entries for all children of page pPage. Also, if
1.2059 +** pPage contains cells that point to overflow pages, set the pointer
1.2060 +** map entries for the overflow pages as well.
1.2061 +*/
1.2062 +static int setChildPtrmaps(MemPage *pPage){
1.2063 + int i; /* Counter variable */
1.2064 + int nCell; /* Number of cells in page pPage */
1.2065 + int rc; /* Return code */
1.2066 + BtShared *pBt = pPage->pBt;
1.2067 + int isInitOrig = pPage->isInit;
1.2068 + Pgno pgno = pPage->pgno;
1.2069 +
1.2070 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.2071 + rc = sqlite3BtreeInitPage(pPage);
1.2072 + if( rc!=SQLITE_OK ){
1.2073 + goto set_child_ptrmaps_out;
1.2074 + }
1.2075 + nCell = pPage->nCell;
1.2076 +
1.2077 + for(i=0; i<nCell; i++){
1.2078 + u8 *pCell = findCell(pPage, i);
1.2079 +
1.2080 + rc = ptrmapPutOvflPtr(pPage, pCell);
1.2081 + if( rc!=SQLITE_OK ){
1.2082 + goto set_child_ptrmaps_out;
1.2083 + }
1.2084 +
1.2085 + if( !pPage->leaf ){
1.2086 + Pgno childPgno = get4byte(pCell);
1.2087 + rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1.2088 + if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
1.2089 + }
1.2090 + }
1.2091 +
1.2092 + if( !pPage->leaf ){
1.2093 + Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.2094 + rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
1.2095 + }
1.2096 +
1.2097 +set_child_ptrmaps_out:
1.2098 + pPage->isInit = isInitOrig;
1.2099 + return rc;
1.2100 +}
1.2101 +
1.2102 +/*
1.2103 +** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
1.2104 +** page, is a pointer to page iFrom. Modify this pointer so that it points to
1.2105 +** iTo. Parameter eType describes the type of pointer to be modified, as
1.2106 +** follows:
1.2107 +**
1.2108 +** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
1.2109 +** page of pPage.
1.2110 +**
1.2111 +** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
1.2112 +** page pointed to by one of the cells on pPage.
1.2113 +**
1.2114 +** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
1.2115 +** overflow page in the list.
1.2116 +*/
1.2117 +static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
1.2118 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.2119 + if( eType==PTRMAP_OVERFLOW2 ){
1.2120 + /* The pointer is always the first 4 bytes of the page in this case. */
1.2121 + if( get4byte(pPage->aData)!=iFrom ){
1.2122 + return SQLITE_CORRUPT_BKPT;
1.2123 + }
1.2124 + put4byte(pPage->aData, iTo);
1.2125 + }else{
1.2126 + int isInitOrig = pPage->isInit;
1.2127 + int i;
1.2128 + int nCell;
1.2129 +
1.2130 + sqlite3BtreeInitPage(pPage);
1.2131 + nCell = pPage->nCell;
1.2132 +
1.2133 + for(i=0; i<nCell; i++){
1.2134 + u8 *pCell = findCell(pPage, i);
1.2135 + if( eType==PTRMAP_OVERFLOW1 ){
1.2136 + CellInfo info;
1.2137 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.2138 + if( info.iOverflow ){
1.2139 + if( iFrom==get4byte(&pCell[info.iOverflow]) ){
1.2140 + put4byte(&pCell[info.iOverflow], iTo);
1.2141 + break;
1.2142 + }
1.2143 + }
1.2144 + }else{
1.2145 + if( get4byte(pCell)==iFrom ){
1.2146 + put4byte(pCell, iTo);
1.2147 + break;
1.2148 + }
1.2149 + }
1.2150 + }
1.2151 +
1.2152 + if( i==nCell ){
1.2153 + if( eType!=PTRMAP_BTREE ||
1.2154 + get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
1.2155 + return SQLITE_CORRUPT_BKPT;
1.2156 + }
1.2157 + put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
1.2158 + }
1.2159 +
1.2160 + pPage->isInit = isInitOrig;
1.2161 + }
1.2162 + return SQLITE_OK;
1.2163 +}
1.2164 +
1.2165 +
1.2166 +/*
1.2167 +** Move the open database page pDbPage to location iFreePage in the
1.2168 +** database. The pDbPage reference remains valid.
1.2169 +*/
1.2170 +static int relocatePage(
1.2171 + BtShared *pBt, /* Btree */
1.2172 + MemPage *pDbPage, /* Open page to move */
1.2173 + u8 eType, /* Pointer map 'type' entry for pDbPage */
1.2174 + Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
1.2175 + Pgno iFreePage, /* The location to move pDbPage to */
1.2176 + int isCommit
1.2177 +){
1.2178 + MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
1.2179 + Pgno iDbPage = pDbPage->pgno;
1.2180 + Pager *pPager = pBt->pPager;
1.2181 + int rc;
1.2182 +
1.2183 + assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
1.2184 + eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
1.2185 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2186 + assert( pDbPage->pBt==pBt );
1.2187 +
1.2188 + /* Move page iDbPage from its current location to page number iFreePage */
1.2189 + TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
1.2190 + iDbPage, iFreePage, iPtrPage, eType));
1.2191 + rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
1.2192 + if( rc!=SQLITE_OK ){
1.2193 + return rc;
1.2194 + }
1.2195 + pDbPage->pgno = iFreePage;
1.2196 +
1.2197 + /* If pDbPage was a btree-page, then it may have child pages and/or cells
1.2198 + ** that point to overflow pages. The pointer map entries for all these
1.2199 + ** pages need to be changed.
1.2200 + **
1.2201 + ** If pDbPage is an overflow page, then the first 4 bytes may store a
1.2202 + ** pointer to a subsequent overflow page. If this is the case, then
1.2203 + ** the pointer map needs to be updated for the subsequent overflow page.
1.2204 + */
1.2205 + if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
1.2206 + rc = setChildPtrmaps(pDbPage);
1.2207 + if( rc!=SQLITE_OK ){
1.2208 + return rc;
1.2209 + }
1.2210 + }else{
1.2211 + Pgno nextOvfl = get4byte(pDbPage->aData);
1.2212 + if( nextOvfl!=0 ){
1.2213 + rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
1.2214 + if( rc!=SQLITE_OK ){
1.2215 + return rc;
1.2216 + }
1.2217 + }
1.2218 + }
1.2219 +
1.2220 + /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
1.2221 + ** that it points at iFreePage. Also fix the pointer map entry for
1.2222 + ** iPtrPage.
1.2223 + */
1.2224 + if( eType!=PTRMAP_ROOTPAGE ){
1.2225 + rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
1.2226 + if( rc!=SQLITE_OK ){
1.2227 + return rc;
1.2228 + }
1.2229 + rc = sqlite3PagerWrite(pPtrPage->pDbPage);
1.2230 + if( rc!=SQLITE_OK ){
1.2231 + releasePage(pPtrPage);
1.2232 + return rc;
1.2233 + }
1.2234 + rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
1.2235 + releasePage(pPtrPage);
1.2236 + if( rc==SQLITE_OK ){
1.2237 + rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
1.2238 + }
1.2239 + }
1.2240 + return rc;
1.2241 +}
1.2242 +
1.2243 +/* Forward declaration required by incrVacuumStep(). */
1.2244 +static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
1.2245 +
1.2246 +/*
1.2247 +** Perform a single step of an incremental-vacuum. If successful,
1.2248 +** return SQLITE_OK. If there is no work to do (and therefore no
1.2249 +** point in calling this function again), return SQLITE_DONE.
1.2250 +**
1.2251 +** More specificly, this function attempts to re-organize the
1.2252 +** database so that the last page of the file currently in use
1.2253 +** is no longer in use.
1.2254 +**
1.2255 +** If the nFin parameter is non-zero, the implementation assumes
1.2256 +** that the caller will keep calling incrVacuumStep() until
1.2257 +** it returns SQLITE_DONE or an error, and that nFin is the
1.2258 +** number of pages the database file will contain after this
1.2259 +** process is complete.
1.2260 +*/
1.2261 +static int incrVacuumStep(BtShared *pBt, Pgno nFin){
1.2262 + Pgno iLastPg; /* Last page in the database */
1.2263 + Pgno nFreeList; /* Number of pages still on the free-list */
1.2264 +
1.2265 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2266 + iLastPg = pBt->nTrunc;
1.2267 + if( iLastPg==0 ){
1.2268 + iLastPg = pagerPagecount(pBt->pPager);
1.2269 + }
1.2270 +
1.2271 + if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
1.2272 + int rc;
1.2273 + u8 eType;
1.2274 + Pgno iPtrPage;
1.2275 +
1.2276 + nFreeList = get4byte(&pBt->pPage1->aData[36]);
1.2277 + if( nFreeList==0 || nFin==iLastPg ){
1.2278 + return SQLITE_DONE;
1.2279 + }
1.2280 +
1.2281 + rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
1.2282 + if( rc!=SQLITE_OK ){
1.2283 + return rc;
1.2284 + }
1.2285 + if( eType==PTRMAP_ROOTPAGE ){
1.2286 + return SQLITE_CORRUPT_BKPT;
1.2287 + }
1.2288 +
1.2289 + if( eType==PTRMAP_FREEPAGE ){
1.2290 + if( nFin==0 ){
1.2291 + /* Remove the page from the files free-list. This is not required
1.2292 + ** if nFin is non-zero. In that case, the free-list will be
1.2293 + ** truncated to zero after this function returns, so it doesn't
1.2294 + ** matter if it still contains some garbage entries.
1.2295 + */
1.2296 + Pgno iFreePg;
1.2297 + MemPage *pFreePg;
1.2298 + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
1.2299 + if( rc!=SQLITE_OK ){
1.2300 + return rc;
1.2301 + }
1.2302 + assert( iFreePg==iLastPg );
1.2303 + releasePage(pFreePg);
1.2304 + }
1.2305 + } else {
1.2306 + Pgno iFreePg; /* Index of free page to move pLastPg to */
1.2307 + MemPage *pLastPg;
1.2308 +
1.2309 + rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
1.2310 + if( rc!=SQLITE_OK ){
1.2311 + return rc;
1.2312 + }
1.2313 +
1.2314 + /* If nFin is zero, this loop runs exactly once and page pLastPg
1.2315 + ** is swapped with the first free page pulled off the free list.
1.2316 + **
1.2317 + ** On the other hand, if nFin is greater than zero, then keep
1.2318 + ** looping until a free-page located within the first nFin pages
1.2319 + ** of the file is found.
1.2320 + */
1.2321 + do {
1.2322 + MemPage *pFreePg;
1.2323 + rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
1.2324 + if( rc!=SQLITE_OK ){
1.2325 + releasePage(pLastPg);
1.2326 + return rc;
1.2327 + }
1.2328 + releasePage(pFreePg);
1.2329 + }while( nFin!=0 && iFreePg>nFin );
1.2330 + assert( iFreePg<iLastPg );
1.2331 +
1.2332 + rc = sqlite3PagerWrite(pLastPg->pDbPage);
1.2333 + if( rc==SQLITE_OK ){
1.2334 + rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
1.2335 + }
1.2336 + releasePage(pLastPg);
1.2337 + if( rc!=SQLITE_OK ){
1.2338 + return rc;
1.2339 + }
1.2340 + }
1.2341 + }
1.2342 +
1.2343 + pBt->nTrunc = iLastPg - 1;
1.2344 + while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
1.2345 + pBt->nTrunc--;
1.2346 + }
1.2347 + return SQLITE_OK;
1.2348 +}
1.2349 +
1.2350 +/*
1.2351 +** A write-transaction must be opened before calling this function.
1.2352 +** It performs a single unit of work towards an incremental vacuum.
1.2353 +**
1.2354 +** If the incremental vacuum is finished after this function has run,
1.2355 +** SQLITE_DONE is returned. If it is not finished, but no error occured,
1.2356 +** SQLITE_OK is returned. Otherwise an SQLite error code.
1.2357 +*/
1.2358 +int sqlite3BtreeIncrVacuum(Btree *p){
1.2359 + int rc;
1.2360 + BtShared *pBt = p->pBt;
1.2361 +
1.2362 + sqlite3BtreeEnter(p);
1.2363 + pBt->db = p->db;
1.2364 + assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
1.2365 + if( !pBt->autoVacuum ){
1.2366 + rc = SQLITE_DONE;
1.2367 + }else{
1.2368 + invalidateAllOverflowCache(pBt);
1.2369 + rc = incrVacuumStep(pBt, 0);
1.2370 + }
1.2371 + sqlite3BtreeLeave(p);
1.2372 + return rc;
1.2373 +}
1.2374 +
1.2375 +/*
1.2376 +** This routine is called prior to sqlite3PagerCommit when a transaction
1.2377 +** is commited for an auto-vacuum database.
1.2378 +**
1.2379 +** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
1.2380 +** the database file should be truncated to during the commit process.
1.2381 +** i.e. the database has been reorganized so that only the first *pnTrunc
1.2382 +** pages are in use.
1.2383 +*/
1.2384 +static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
1.2385 + int rc = SQLITE_OK;
1.2386 + Pager *pPager = pBt->pPager;
1.2387 + VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
1.2388 +
1.2389 + assert( sqlite3_mutex_held(pBt->mutex) );
1.2390 + invalidateAllOverflowCache(pBt);
1.2391 + assert(pBt->autoVacuum);
1.2392 + if( !pBt->incrVacuum ){
1.2393 + Pgno nFin = 0;
1.2394 +
1.2395 + if( pBt->nTrunc==0 ){
1.2396 + Pgno nFree;
1.2397 + Pgno nPtrmap;
1.2398 + const int pgsz = pBt->pageSize;
1.2399 + int nOrig = pagerPagecount(pBt->pPager);
1.2400 +
1.2401 + if( PTRMAP_ISPAGE(pBt, nOrig) ){
1.2402 + return SQLITE_CORRUPT_BKPT;
1.2403 + }
1.2404 + if( nOrig==PENDING_BYTE_PAGE(pBt) ){
1.2405 + nOrig--;
1.2406 + }
1.2407 + nFree = get4byte(&pBt->pPage1->aData[36]);
1.2408 + nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
1.2409 + nFin = nOrig - nFree - nPtrmap;
1.2410 + if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
1.2411 + nFin--;
1.2412 + }
1.2413 + while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
1.2414 + nFin--;
1.2415 + }
1.2416 + }
1.2417 +
1.2418 + while( rc==SQLITE_OK ){
1.2419 + rc = incrVacuumStep(pBt, nFin);
1.2420 + }
1.2421 + if( rc==SQLITE_DONE ){
1.2422 + assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
1.2423 + rc = SQLITE_OK;
1.2424 + if( pBt->nTrunc && nFin ){
1.2425 + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
1.2426 + put4byte(&pBt->pPage1->aData[32], 0);
1.2427 + put4byte(&pBt->pPage1->aData[36], 0);
1.2428 + pBt->nTrunc = nFin;
1.2429 + }
1.2430 + }
1.2431 + if( rc!=SQLITE_OK ){
1.2432 + sqlite3PagerRollback(pPager);
1.2433 + }
1.2434 + }
1.2435 +
1.2436 + if( rc==SQLITE_OK ){
1.2437 + *pnTrunc = pBt->nTrunc;
1.2438 + pBt->nTrunc = 0;
1.2439 + }
1.2440 + assert( nRef==sqlite3PagerRefcount(pPager) );
1.2441 + return rc;
1.2442 +}
1.2443 +
1.2444 +#endif
1.2445 +
1.2446 +/*
1.2447 +** This routine does the first phase of a two-phase commit. This routine
1.2448 +** causes a rollback journal to be created (if it does not already exist)
1.2449 +** and populated with enough information so that if a power loss occurs
1.2450 +** the database can be restored to its original state by playing back
1.2451 +** the journal. Then the contents of the journal are flushed out to
1.2452 +** the disk. After the journal is safely on oxide, the changes to the
1.2453 +** database are written into the database file and flushed to oxide.
1.2454 +** At the end of this call, the rollback journal still exists on the
1.2455 +** disk and we are still holding all locks, so the transaction has not
1.2456 +** committed. See sqlite3BtreeCommit() for the second phase of the
1.2457 +** commit process.
1.2458 +**
1.2459 +** This call is a no-op if no write-transaction is currently active on pBt.
1.2460 +**
1.2461 +** Otherwise, sync the database file for the btree pBt. zMaster points to
1.2462 +** the name of a master journal file that should be written into the
1.2463 +** individual journal file, or is NULL, indicating no master journal file
1.2464 +** (single database transaction).
1.2465 +**
1.2466 +** When this is called, the master journal should already have been
1.2467 +** created, populated with this journal pointer and synced to disk.
1.2468 +**
1.2469 +** Once this is routine has returned, the only thing required to commit
1.2470 +** the write-transaction for this database file is to delete the journal.
1.2471 +*/
1.2472 +int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
1.2473 + int rc = SQLITE_OK;
1.2474 + if( p->inTrans==TRANS_WRITE ){
1.2475 + BtShared *pBt = p->pBt;
1.2476 + Pgno nTrunc = 0;
1.2477 + sqlite3BtreeEnter(p);
1.2478 + pBt->db = p->db;
1.2479 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2480 + if( pBt->autoVacuum ){
1.2481 + rc = autoVacuumCommit(pBt, &nTrunc);
1.2482 + if( rc!=SQLITE_OK ){
1.2483 + sqlite3BtreeLeave(p);
1.2484 + return rc;
1.2485 + }
1.2486 + }
1.2487 +#endif
1.2488 + rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
1.2489 + sqlite3BtreeLeave(p);
1.2490 + }
1.2491 + return rc;
1.2492 +}
1.2493 +
1.2494 +/*
1.2495 +** Commit the transaction currently in progress.
1.2496 +**
1.2497 +** This routine implements the second phase of a 2-phase commit. The
1.2498 +** sqlite3BtreeSync() routine does the first phase and should be invoked
1.2499 +** prior to calling this routine. The sqlite3BtreeSync() routine did
1.2500 +** all the work of writing information out to disk and flushing the
1.2501 +** contents so that they are written onto the disk platter. All this
1.2502 +** routine has to do is delete or truncate the rollback journal
1.2503 +** (which causes the transaction to commit) and drop locks.
1.2504 +**
1.2505 +** This will release the write lock on the database file. If there
1.2506 +** are no active cursors, it also releases the read lock.
1.2507 +*/
1.2508 +int sqlite3BtreeCommitPhaseTwo(Btree *p){
1.2509 + BtShared *pBt = p->pBt;
1.2510 +
1.2511 + sqlite3BtreeEnter(p);
1.2512 + pBt->db = p->db;
1.2513 + btreeIntegrity(p);
1.2514 +
1.2515 + /* If the handle has a write-transaction open, commit the shared-btrees
1.2516 + ** transaction and set the shared state to TRANS_READ.
1.2517 + */
1.2518 + if( p->inTrans==TRANS_WRITE ){
1.2519 + int rc;
1.2520 + assert( pBt->inTransaction==TRANS_WRITE );
1.2521 + assert( pBt->nTransaction>0 );
1.2522 + rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
1.2523 + if( rc!=SQLITE_OK ){
1.2524 + sqlite3BtreeLeave(p);
1.2525 + return rc;
1.2526 + }
1.2527 + pBt->inTransaction = TRANS_READ;
1.2528 + pBt->inStmt = 0;
1.2529 + }
1.2530 + unlockAllTables(p);
1.2531 +
1.2532 + /* If the handle has any kind of transaction open, decrement the transaction
1.2533 + ** count of the shared btree. If the transaction count reaches 0, set
1.2534 + ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
1.2535 + ** will unlock the pager.
1.2536 + */
1.2537 + if( p->inTrans!=TRANS_NONE ){
1.2538 + pBt->nTransaction--;
1.2539 + if( 0==pBt->nTransaction ){
1.2540 + pBt->inTransaction = TRANS_NONE;
1.2541 + }
1.2542 + }
1.2543 +
1.2544 + /* Set the handles current transaction state to TRANS_NONE and unlock
1.2545 + ** the pager if this call closed the only read or write transaction.
1.2546 + */
1.2547 + p->inTrans = TRANS_NONE;
1.2548 + unlockBtreeIfUnused(pBt);
1.2549 +
1.2550 + btreeIntegrity(p);
1.2551 + sqlite3BtreeLeave(p);
1.2552 + return SQLITE_OK;
1.2553 +}
1.2554 +
1.2555 +/*
1.2556 +** Do both phases of a commit.
1.2557 +*/
1.2558 +int sqlite3BtreeCommit(Btree *p){
1.2559 + int rc;
1.2560 + sqlite3BtreeEnter(p);
1.2561 + rc = sqlite3BtreeCommitPhaseOne(p, 0);
1.2562 + if( rc==SQLITE_OK ){
1.2563 + rc = sqlite3BtreeCommitPhaseTwo(p);
1.2564 + }
1.2565 + sqlite3BtreeLeave(p);
1.2566 + return rc;
1.2567 +}
1.2568 +
1.2569 +#ifndef NDEBUG
1.2570 +/*
1.2571 +** Return the number of write-cursors open on this handle. This is for use
1.2572 +** in assert() expressions, so it is only compiled if NDEBUG is not
1.2573 +** defined.
1.2574 +**
1.2575 +** For the purposes of this routine, a write-cursor is any cursor that
1.2576 +** is capable of writing to the databse. That means the cursor was
1.2577 +** originally opened for writing and the cursor has not be disabled
1.2578 +** by having its state changed to CURSOR_FAULT.
1.2579 +*/
1.2580 +static int countWriteCursors(BtShared *pBt){
1.2581 + BtCursor *pCur;
1.2582 + int r = 0;
1.2583 + for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
1.2584 + if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
1.2585 + }
1.2586 + return r;
1.2587 +}
1.2588 +#endif
1.2589 +
1.2590 +/*
1.2591 +** This routine sets the state to CURSOR_FAULT and the error
1.2592 +** code to errCode for every cursor on BtShared that pBtree
1.2593 +** references.
1.2594 +**
1.2595 +** Every cursor is tripped, including cursors that belong
1.2596 +** to other database connections that happen to be sharing
1.2597 +** the cache with pBtree.
1.2598 +**
1.2599 +** This routine gets called when a rollback occurs.
1.2600 +** All cursors using the same cache must be tripped
1.2601 +** to prevent them from trying to use the btree after
1.2602 +** the rollback. The rollback may have deleted tables
1.2603 +** or moved root pages, so it is not sufficient to
1.2604 +** save the state of the cursor. The cursor must be
1.2605 +** invalidated.
1.2606 +*/
1.2607 +void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
1.2608 + BtCursor *p;
1.2609 + sqlite3BtreeEnter(pBtree);
1.2610 + for(p=pBtree->pBt->pCursor; p; p=p->pNext){
1.2611 + sqlite3BtreeClearCursor(p);
1.2612 + p->eState = CURSOR_FAULT;
1.2613 + p->skip = errCode;
1.2614 + }
1.2615 + sqlite3BtreeLeave(pBtree);
1.2616 +}
1.2617 +
1.2618 +/*
1.2619 +** Rollback the transaction in progress. All cursors will be
1.2620 +** invalided by this operation. Any attempt to use a cursor
1.2621 +** that was open at the beginning of this operation will result
1.2622 +** in an error.
1.2623 +**
1.2624 +** This will release the write lock on the database file. If there
1.2625 +** are no active cursors, it also releases the read lock.
1.2626 +*/
1.2627 +int sqlite3BtreeRollback(Btree *p){
1.2628 + int rc;
1.2629 + BtShared *pBt = p->pBt;
1.2630 + MemPage *pPage1;
1.2631 +
1.2632 + sqlite3BtreeEnter(p);
1.2633 + pBt->db = p->db;
1.2634 + rc = saveAllCursors(pBt, 0, 0);
1.2635 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.2636 + if( rc!=SQLITE_OK ){
1.2637 + /* This is a horrible situation. An IO or malloc() error occured whilst
1.2638 + ** trying to save cursor positions. If this is an automatic rollback (as
1.2639 + ** the result of a constraint, malloc() failure or IO error) then
1.2640 + ** the cache may be internally inconsistent (not contain valid trees) so
1.2641 + ** we cannot simply return the error to the caller. Instead, abort
1.2642 + ** all queries that may be using any of the cursors that failed to save.
1.2643 + */
1.2644 + sqlite3BtreeTripAllCursors(p, rc);
1.2645 + }
1.2646 +#endif
1.2647 + btreeIntegrity(p);
1.2648 + unlockAllTables(p);
1.2649 +
1.2650 + if( p->inTrans==TRANS_WRITE ){
1.2651 + int rc2;
1.2652 +
1.2653 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.2654 + pBt->nTrunc = 0;
1.2655 +#endif
1.2656 +
1.2657 + assert( TRANS_WRITE==pBt->inTransaction );
1.2658 + rc2 = sqlite3PagerRollback(pBt->pPager);
1.2659 + if( rc2!=SQLITE_OK ){
1.2660 + rc = rc2;
1.2661 + }
1.2662 +
1.2663 + /* The rollback may have destroyed the pPage1->aData value. So
1.2664 + ** call sqlite3BtreeGetPage() on page 1 again to make
1.2665 + ** sure pPage1->aData is set correctly. */
1.2666 + if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
1.2667 + releasePage(pPage1);
1.2668 + }
1.2669 + assert( countWriteCursors(pBt)==0 );
1.2670 + pBt->inTransaction = TRANS_READ;
1.2671 + }
1.2672 +
1.2673 + if( p->inTrans!=TRANS_NONE ){
1.2674 + assert( pBt->nTransaction>0 );
1.2675 + pBt->nTransaction--;
1.2676 + if( 0==pBt->nTransaction ){
1.2677 + pBt->inTransaction = TRANS_NONE;
1.2678 + }
1.2679 + }
1.2680 +
1.2681 + p->inTrans = TRANS_NONE;
1.2682 + pBt->inStmt = 0;
1.2683 + unlockBtreeIfUnused(pBt);
1.2684 +
1.2685 + btreeIntegrity(p);
1.2686 + sqlite3BtreeLeave(p);
1.2687 + return rc;
1.2688 +}
1.2689 +
1.2690 +/*
1.2691 +** Start a statement subtransaction. The subtransaction can
1.2692 +** can be rolled back independently of the main transaction.
1.2693 +** You must start a transaction before starting a subtransaction.
1.2694 +** The subtransaction is ended automatically if the main transaction
1.2695 +** commits or rolls back.
1.2696 +**
1.2697 +** Only one subtransaction may be active at a time. It is an error to try
1.2698 +** to start a new subtransaction if another subtransaction is already active.
1.2699 +**
1.2700 +** Statement subtransactions are used around individual SQL statements
1.2701 +** that are contained within a BEGIN...COMMIT block. If a constraint
1.2702 +** error occurs within the statement, the effect of that one statement
1.2703 +** can be rolled back without having to rollback the entire transaction.
1.2704 +*/
1.2705 +int sqlite3BtreeBeginStmt(Btree *p){
1.2706 + int rc;
1.2707 + BtShared *pBt = p->pBt;
1.2708 + sqlite3BtreeEnter(p);
1.2709 + pBt->db = p->db;
1.2710 + if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
1.2711 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.2712 + }else{
1.2713 + assert( pBt->inTransaction==TRANS_WRITE );
1.2714 + rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
1.2715 + pBt->inStmt = 1;
1.2716 + }
1.2717 + sqlite3BtreeLeave(p);
1.2718 + return rc;
1.2719 +}
1.2720 +
1.2721 +
1.2722 +/*
1.2723 +** Commit the statment subtransaction currently in progress. If no
1.2724 +** subtransaction is active, this is a no-op.
1.2725 +*/
1.2726 +int sqlite3BtreeCommitStmt(Btree *p){
1.2727 + int rc;
1.2728 + BtShared *pBt = p->pBt;
1.2729 + sqlite3BtreeEnter(p);
1.2730 + pBt->db = p->db;
1.2731 + if( pBt->inStmt && !pBt->readOnly ){
1.2732 + rc = sqlite3PagerStmtCommit(pBt->pPager);
1.2733 + }else{
1.2734 + rc = SQLITE_OK;
1.2735 + }
1.2736 + pBt->inStmt = 0;
1.2737 + sqlite3BtreeLeave(p);
1.2738 + return rc;
1.2739 +}
1.2740 +
1.2741 +/*
1.2742 +** Rollback the active statement subtransaction. If no subtransaction
1.2743 +** is active this routine is a no-op.
1.2744 +**
1.2745 +** All cursors will be invalidated by this operation. Any attempt
1.2746 +** to use a cursor that was open at the beginning of this operation
1.2747 +** will result in an error.
1.2748 +*/
1.2749 +int sqlite3BtreeRollbackStmt(Btree *p){
1.2750 + int rc = SQLITE_OK;
1.2751 + BtShared *pBt = p->pBt;
1.2752 + sqlite3BtreeEnter(p);
1.2753 + pBt->db = p->db;
1.2754 + if( pBt->inStmt && !pBt->readOnly ){
1.2755 + rc = sqlite3PagerStmtRollback(pBt->pPager);
1.2756 + pBt->inStmt = 0;
1.2757 + }
1.2758 + sqlite3BtreeLeave(p);
1.2759 + return rc;
1.2760 +}
1.2761 +
1.2762 +/*
1.2763 +** Create a new cursor for the BTree whose root is on the page
1.2764 +** iTable. The act of acquiring a cursor gets a read lock on
1.2765 +** the database file.
1.2766 +**
1.2767 +** If wrFlag==0, then the cursor can only be used for reading.
1.2768 +** If wrFlag==1, then the cursor can be used for reading or for
1.2769 +** writing if other conditions for writing are also met. These
1.2770 +** are the conditions that must be met in order for writing to
1.2771 +** be allowed:
1.2772 +**
1.2773 +** 1: The cursor must have been opened with wrFlag==1
1.2774 +**
1.2775 +** 2: Other database connections that share the same pager cache
1.2776 +** but which are not in the READ_UNCOMMITTED state may not have
1.2777 +** cursors open with wrFlag==0 on the same table. Otherwise
1.2778 +** the changes made by this write cursor would be visible to
1.2779 +** the read cursors in the other database connection.
1.2780 +**
1.2781 +** 3: The database must be writable (not on read-only media)
1.2782 +**
1.2783 +** 4: There must be an active transaction.
1.2784 +**
1.2785 +** No checking is done to make sure that page iTable really is the
1.2786 +** root page of a b-tree. If it is not, then the cursor acquired
1.2787 +** will not work correctly.
1.2788 +**
1.2789 +** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
1.2790 +** pointed to by pCur have been zeroed by the caller.
1.2791 +*/
1.2792 +static int btreeCursor(
1.2793 + Btree *p, /* The btree */
1.2794 + int iTable, /* Root page of table to open */
1.2795 + int wrFlag, /* 1 to write. 0 read-only */
1.2796 + struct KeyInfo *pKeyInfo, /* First arg to comparison function */
1.2797 + BtCursor *pCur /* Space for new cursor */
1.2798 +){
1.2799 + int rc;
1.2800 + BtShared *pBt = p->pBt;
1.2801 +
1.2802 + assert( sqlite3BtreeHoldsMutex(p) );
1.2803 + if( wrFlag ){
1.2804 + if( pBt->readOnly ){
1.2805 + return SQLITE_READONLY;
1.2806 + }
1.2807 + if( checkReadLocks(p, iTable, 0, 0) ){
1.2808 + return SQLITE_LOCKED;
1.2809 + }
1.2810 + }
1.2811 +
1.2812 + if( pBt->pPage1==0 ){
1.2813 + rc = lockBtreeWithRetry(p);
1.2814 + if( rc!=SQLITE_OK ){
1.2815 + return rc;
1.2816 + }
1.2817 + if( pBt->readOnly && wrFlag ){
1.2818 + return SQLITE_READONLY;
1.2819 + }
1.2820 + }
1.2821 + pCur->pgnoRoot = (Pgno)iTable;
1.2822 + if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
1.2823 + rc = SQLITE_EMPTY;
1.2824 + goto create_cursor_exception;
1.2825 + }
1.2826 + rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
1.2827 + if( rc!=SQLITE_OK ){
1.2828 + goto create_cursor_exception;
1.2829 + }
1.2830 +
1.2831 + /* Now that no other errors can occur, finish filling in the BtCursor
1.2832 + ** variables, link the cursor into the BtShared list and set *ppCur (the
1.2833 + ** output argument to this function).
1.2834 + */
1.2835 + pCur->pKeyInfo = pKeyInfo;
1.2836 + pCur->pBtree = p;
1.2837 + pCur->pBt = pBt;
1.2838 + pCur->wrFlag = wrFlag;
1.2839 + pCur->pNext = pBt->pCursor;
1.2840 + if( pCur->pNext ){
1.2841 + pCur->pNext->pPrev = pCur;
1.2842 + }
1.2843 + pBt->pCursor = pCur;
1.2844 + pCur->eState = CURSOR_INVALID;
1.2845 +
1.2846 + return SQLITE_OK;
1.2847 +
1.2848 +create_cursor_exception:
1.2849 + releasePage(pCur->apPage[0]);
1.2850 + unlockBtreeIfUnused(pBt);
1.2851 + return rc;
1.2852 +}
1.2853 +int sqlite3BtreeCursor(
1.2854 + Btree *p, /* The btree */
1.2855 + int iTable, /* Root page of table to open */
1.2856 + int wrFlag, /* 1 to write. 0 read-only */
1.2857 + struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
1.2858 + BtCursor *pCur /* Write new cursor here */
1.2859 +){
1.2860 + int rc;
1.2861 + sqlite3BtreeEnter(p);
1.2862 + p->pBt->db = p->db;
1.2863 + rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
1.2864 + sqlite3BtreeLeave(p);
1.2865 + return rc;
1.2866 +}
1.2867 +int sqlite3BtreeCursorSize(){
1.2868 + return sizeof(BtCursor);
1.2869 +}
1.2870 +
1.2871 +
1.2872 +
1.2873 +/*
1.2874 +** Close a cursor. The read lock on the database file is released
1.2875 +** when the last cursor is closed.
1.2876 +*/
1.2877 +int sqlite3BtreeCloseCursor(BtCursor *pCur){
1.2878 + Btree *pBtree = pCur->pBtree;
1.2879 + if( pBtree ){
1.2880 + int i;
1.2881 + BtShared *pBt = pCur->pBt;
1.2882 + sqlite3BtreeEnter(pBtree);
1.2883 + pBt->db = pBtree->db;
1.2884 + sqlite3BtreeClearCursor(pCur);
1.2885 + if( pCur->pPrev ){
1.2886 + pCur->pPrev->pNext = pCur->pNext;
1.2887 + }else{
1.2888 + pBt->pCursor = pCur->pNext;
1.2889 + }
1.2890 + if( pCur->pNext ){
1.2891 + pCur->pNext->pPrev = pCur->pPrev;
1.2892 + }
1.2893 + for(i=0; i<=pCur->iPage; i++){
1.2894 + releasePage(pCur->apPage[i]);
1.2895 + }
1.2896 + unlockBtreeIfUnused(pBt);
1.2897 + invalidateOverflowCache(pCur);
1.2898 + /* sqlite3_free(pCur); */
1.2899 + sqlite3BtreeLeave(pBtree);
1.2900 + }
1.2901 + return SQLITE_OK;
1.2902 +}
1.2903 +
1.2904 +/*
1.2905 +** Make a temporary cursor by filling in the fields of pTempCur.
1.2906 +** The temporary cursor is not on the cursor list for the Btree.
1.2907 +*/
1.2908 +void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1.2909 + int i;
1.2910 + assert( cursorHoldsMutex(pCur) );
1.2911 + memcpy(pTempCur, pCur, sizeof(BtCursor));
1.2912 + pTempCur->pNext = 0;
1.2913 + pTempCur->pPrev = 0;
1.2914 + for(i=0; i<=pTempCur->iPage; i++){
1.2915 + sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
1.2916 + }
1.2917 +}
1.2918 +
1.2919 +/*
1.2920 +** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
1.2921 +** function above.
1.2922 +*/
1.2923 +void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
1.2924 + int i;
1.2925 + assert( cursorHoldsMutex(pCur) );
1.2926 + for(i=0; i<=pCur->iPage; i++){
1.2927 + sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
1.2928 + }
1.2929 +}
1.2930 +
1.2931 +/*
1.2932 +** Make sure the BtCursor* given in the argument has a valid
1.2933 +** BtCursor.info structure. If it is not already valid, call
1.2934 +** sqlite3BtreeParseCell() to fill it in.
1.2935 +**
1.2936 +** BtCursor.info is a cache of the information in the current cell.
1.2937 +** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
1.2938 +**
1.2939 +** 2007-06-25: There is a bug in some versions of MSVC that cause the
1.2940 +** compiler to crash when getCellInfo() is implemented as a macro.
1.2941 +** But there is a measureable speed advantage to using the macro on gcc
1.2942 +** (when less compiler optimizations like -Os or -O0 are used and the
1.2943 +** compiler is not doing agressive inlining.) So we use a real function
1.2944 +** for MSVC and a macro for everything else. Ticket #2457.
1.2945 +*/
1.2946 +#ifndef NDEBUG
1.2947 + static void assertCellInfo(BtCursor *pCur){
1.2948 + CellInfo info;
1.2949 + int iPage = pCur->iPage;
1.2950 + memset(&info, 0, sizeof(info));
1.2951 + sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
1.2952 + assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
1.2953 + }
1.2954 +#else
1.2955 + #define assertCellInfo(x)
1.2956 +#endif
1.2957 +#ifdef _MSC_VER
1.2958 + /* Use a real function in MSVC to work around bugs in that compiler. */
1.2959 + static void getCellInfo(BtCursor *pCur){
1.2960 + if( pCur->info.nSize==0 ){
1.2961 + int iPage = pCur->iPage;
1.2962 + sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
1.2963 + pCur->validNKey = 1;
1.2964 + }else{
1.2965 + assertCellInfo(pCur);
1.2966 + }
1.2967 + }
1.2968 +#else /* if not _MSC_VER */
1.2969 + /* Use a macro in all other compilers so that the function is inlined */
1.2970 +#define getCellInfo(pCur) \
1.2971 + if( pCur->info.nSize==0 ){ \
1.2972 + int iPage = pCur->iPage; \
1.2973 + sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
1.2974 + pCur->validNKey = 1; \
1.2975 + }else{ \
1.2976 + assertCellInfo(pCur); \
1.2977 + }
1.2978 +#endif /* _MSC_VER */
1.2979 +
1.2980 +/*
1.2981 +** Set *pSize to the size of the buffer needed to hold the value of
1.2982 +** the key for the current entry. If the cursor is not pointing
1.2983 +** to a valid entry, *pSize is set to 0.
1.2984 +**
1.2985 +** For a table with the INTKEY flag set, this routine returns the key
1.2986 +** itself, not the number of bytes in the key.
1.2987 +*/
1.2988 +int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
1.2989 + int rc;
1.2990 +
1.2991 + assert( cursorHoldsMutex(pCur) );
1.2992 + rc = restoreCursorPosition(pCur);
1.2993 + if( rc==SQLITE_OK ){
1.2994 + assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
1.2995 + if( pCur->eState==CURSOR_INVALID ){
1.2996 + *pSize = 0;
1.2997 + }else{
1.2998 + getCellInfo(pCur);
1.2999 + *pSize = pCur->info.nKey;
1.3000 + }
1.3001 + }
1.3002 + return rc;
1.3003 +}
1.3004 +
1.3005 +/*
1.3006 +** Set *pSize to the number of bytes of data in the entry the
1.3007 +** cursor currently points to. Always return SQLITE_OK.
1.3008 +** Failure is not possible. If the cursor is not currently
1.3009 +** pointing to an entry (which can happen, for example, if
1.3010 +** the database is empty) then *pSize is set to 0.
1.3011 +*/
1.3012 +int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
1.3013 + int rc;
1.3014 +
1.3015 + assert( cursorHoldsMutex(pCur) );
1.3016 + rc = restoreCursorPosition(pCur);
1.3017 + if( rc==SQLITE_OK ){
1.3018 + assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
1.3019 + if( pCur->eState==CURSOR_INVALID ){
1.3020 + /* Not pointing at a valid entry - set *pSize to 0. */
1.3021 + *pSize = 0;
1.3022 + }else{
1.3023 + getCellInfo(pCur);
1.3024 + *pSize = pCur->info.nData;
1.3025 + }
1.3026 + }
1.3027 + return rc;
1.3028 +}
1.3029 +
1.3030 +/*
1.3031 +** Given the page number of an overflow page in the database (parameter
1.3032 +** ovfl), this function finds the page number of the next page in the
1.3033 +** linked list of overflow pages. If possible, it uses the auto-vacuum
1.3034 +** pointer-map data instead of reading the content of page ovfl to do so.
1.3035 +**
1.3036 +** If an error occurs an SQLite error code is returned. Otherwise:
1.3037 +**
1.3038 +** Unless pPgnoNext is NULL, the page number of the next overflow
1.3039 +** page in the linked list is written to *pPgnoNext. If page ovfl
1.3040 +** is the last page in its linked list, *pPgnoNext is set to zero.
1.3041 +**
1.3042 +** If ppPage is not NULL, *ppPage is set to the MemPage* handle
1.3043 +** for page ovfl. The underlying pager page may have been requested
1.3044 +** with the noContent flag set, so the page data accessable via
1.3045 +** this handle may not be trusted.
1.3046 +*/
1.3047 +static int getOverflowPage(
1.3048 + BtShared *pBt,
1.3049 + Pgno ovfl, /* Overflow page */
1.3050 + MemPage **ppPage, /* OUT: MemPage handle */
1.3051 + Pgno *pPgnoNext /* OUT: Next overflow page number */
1.3052 +){
1.3053 + Pgno next = 0;
1.3054 + int rc;
1.3055 +
1.3056 + assert( sqlite3_mutex_held(pBt->mutex) );
1.3057 + /* One of these must not be NULL. Otherwise, why call this function? */
1.3058 + assert(ppPage || pPgnoNext);
1.3059 +
1.3060 + /* If pPgnoNext is NULL, then this function is being called to obtain
1.3061 + ** a MemPage* reference only. No page-data is required in this case.
1.3062 + */
1.3063 + if( !pPgnoNext ){
1.3064 + return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
1.3065 + }
1.3066 +
1.3067 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.3068 + /* Try to find the next page in the overflow list using the
1.3069 + ** autovacuum pointer-map pages. Guess that the next page in
1.3070 + ** the overflow list is page number (ovfl+1). If that guess turns
1.3071 + ** out to be wrong, fall back to loading the data of page
1.3072 + ** number ovfl to determine the next page number.
1.3073 + */
1.3074 + if( pBt->autoVacuum ){
1.3075 + Pgno pgno;
1.3076 + Pgno iGuess = ovfl+1;
1.3077 + u8 eType;
1.3078 +
1.3079 + while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
1.3080 + iGuess++;
1.3081 + }
1.3082 +
1.3083 + if( iGuess<=pagerPagecount(pBt->pPager) ){
1.3084 + rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
1.3085 + if( rc!=SQLITE_OK ){
1.3086 + return rc;
1.3087 + }
1.3088 + if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
1.3089 + next = iGuess;
1.3090 + }
1.3091 + }
1.3092 + }
1.3093 +#endif
1.3094 +
1.3095 + if( next==0 || ppPage ){
1.3096 + MemPage *pPage = 0;
1.3097 +
1.3098 + rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
1.3099 + assert(rc==SQLITE_OK || pPage==0);
1.3100 + if( next==0 && rc==SQLITE_OK ){
1.3101 + next = get4byte(pPage->aData);
1.3102 + }
1.3103 +
1.3104 + if( ppPage ){
1.3105 + *ppPage = pPage;
1.3106 + }else{
1.3107 + releasePage(pPage);
1.3108 + }
1.3109 + }
1.3110 + *pPgnoNext = next;
1.3111 +
1.3112 + return rc;
1.3113 +}
1.3114 +
1.3115 +/*
1.3116 +** Copy data from a buffer to a page, or from a page to a buffer.
1.3117 +**
1.3118 +** pPayload is a pointer to data stored on database page pDbPage.
1.3119 +** If argument eOp is false, then nByte bytes of data are copied
1.3120 +** from pPayload to the buffer pointed at by pBuf. If eOp is true,
1.3121 +** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
1.3122 +** of data are copied from the buffer pBuf to pPayload.
1.3123 +**
1.3124 +** SQLITE_OK is returned on success, otherwise an error code.
1.3125 +*/
1.3126 +static int copyPayload(
1.3127 + void *pPayload, /* Pointer to page data */
1.3128 + void *pBuf, /* Pointer to buffer */
1.3129 + int nByte, /* Number of bytes to copy */
1.3130 + int eOp, /* 0 -> copy from page, 1 -> copy to page */
1.3131 + DbPage *pDbPage /* Page containing pPayload */
1.3132 +){
1.3133 + if( eOp ){
1.3134 + /* Copy data from buffer to page (a write operation) */
1.3135 + int rc = sqlite3PagerWrite(pDbPage);
1.3136 + if( rc!=SQLITE_OK ){
1.3137 + return rc;
1.3138 + }
1.3139 + memcpy(pPayload, pBuf, nByte);
1.3140 + }else{
1.3141 + /* Copy data from page to buffer (a read operation) */
1.3142 + memcpy(pBuf, pPayload, nByte);
1.3143 + }
1.3144 + return SQLITE_OK;
1.3145 +}
1.3146 +
/*
** This function is used to read or overwrite payload information
** for the entry that the pCur cursor is pointing to. If the eOp
** parameter is 0, this is a read operation (data copied into
** buffer pBuf). If it is non-zero, a write (data copied from
** buffer pBuf).
**
** A total of "amt" bytes are read or written beginning at "offset".
** Data is read to or from the buffer pBuf.
**
** This routine does not make a distinction between key and data.
** It just reads or writes bytes from the payload area.  Data might
** appear on the main page or be scattered out on multiple overflow
** pages.
**
** If the BtCursor.isIncrblobHandle flag is set, and the current
** cursor entry uses one or more overflow pages, this function
** allocates space for and lazily populates the overflow page-list
** cache array (BtCursor.aOverflow). Subsequent calls use this
** cache to make seeking to the supplied offset more efficient.
**
** Once an overflow page-list cache has been allocated, it may be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row. Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).
**
** Returns SQLITE_OK when all "amt" bytes were transferred.  Returns
** SQLITE_CORRUPT_BKPT if the requested range extends past the end of the
** payload, if the local payload would extend past the usable area of
** the page, or if the overflow chain ends before "amt" bytes have been
** transferred.  Returns SQLITE_NOMEM if the overflow cache cannot be
** allocated, or any error reported by the pager/overflow helpers.
*/
static int accessPayload(
  BtCursor *pCur,      /* Cursor pointing to entry to read from */
  int offset,          /* Begin reading this far into payload */
  int amt,             /* Read this many bytes */
  unsigned char *pBuf, /* Write the bytes into this buffer */
  int skipKey,         /* offset begins at data if this is true */
  int eOp              /* zero to read. non-zero to write. */
){
  unsigned char *aPayload;
  int rc = SQLITE_OK;
  u32 nKey;
  int iIdx = 0;                               /* Index of current overflow page in the chain */
  MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */

  assert( pPage );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  assert( offset>=0 );
  assert( cursorHoldsMutex(pCur) );

  /* Make sure pCur->info describes the current cell.  aPayload is then
  ** set to the first payload byte, just past the cell header. */
  getCellInfo(pCur);
  aPayload = pCur->info.pCell + pCur->info.nHeader;
  /* On intKey pages no key bytes are counted as part of the payload. */
  nKey = (pPage->intKey ? 0 : pCur->info.nKey);

  if( skipKey ){
    offset += nKey;
  }
  if( offset+amt > nKey+pCur->info.nData
   || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
  ){
    /* Trying to read or write past the end of the data is an error */
    return SQLITE_CORRUPT_BKPT;
  }

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    /* Transfer the part of the range stored locally; "a" is clipped to
    ** the end of the local payload.  Whatever remains starts at offset 0
    ** of the overflow data. */
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    /* The range lies entirely in overflow pages: make offset relative to
    ** the start of the overflow data. */
    offset -= pCur->info.nLocal;
  }

  /* NOTE(review): pBt already holds pCur->pBt from its initializer above;
  ** this reassignment looks redundant. */
  pBt = pCur->pBt;
  if( rc==SQLITE_OK && amt>0 ){
    const int ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    /* The page number of the first overflow page is read from the four
    ** bytes that follow the local payload. */
    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

#ifndef SQLITE_OMIT_INCRBLOB
    /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
    ** has not been allocated, allocate it now. The array is sized at
    ** one entry for each overflow page in the overflow chain. The
    ** page number of the first overflow page is stored in aOverflow[0],
    ** etc. A value of 0 in the aOverflow[] array means "not yet known"
    ** (the cache is lazily populated).
    */
    if( pCur->isIncrblobHandle && !pCur->aOverflow ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
      if( nOvfl && !pCur->aOverflow ){
        rc = SQLITE_NOMEM;
      }
    }

    /* If the overflow page-list cache has been allocated and the
    ** entry for the first required overflow page is valid, skip
    ** directly to it.
    */
    if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
      iIdx = (offset/ovflSize);
      nextPage = pCur->aOverflow[iIdx];
      offset = (offset%ovflSize);
    }
#endif

    /* Walk the overflow chain.  On each iteration nextPage is the page
    ** number of overflow page number iIdx in the chain, and offset is
    ** relative to the start of that page's content. */
    for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){

#ifndef SQLITE_OMIT_INCRBLOB
      /* If required, populate the overflow page-list cache. */
      if( pCur->aOverflow ){
        assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
        pCur->aOverflow[iIdx] = nextPage;
      }
#endif

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        */
#ifndef SQLITE_OMIT_INCRBLOB
        if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        } else
#endif
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        offset -= ovflSize;
      }else{
        /* Need to read this page properly. It contains some of the
        ** range of data that is being read (eOp==0) or written (eOp!=0).
        */
        DbPage *pDbPage;
        int a = amt;
        rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
        if( rc==SQLITE_OK ){
          /* The first four bytes of the page hold the next page number;
          ** the content follows at byte 4, hence the "+4" below. */
          aPayload = sqlite3PagerGetData(pDbPage);
          nextPage = get4byte(aPayload);
          if( a + offset > ovflSize ){
            a = ovflSize - offset;
          }
          rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
          sqlite3PagerUnref(pDbPage);
          offset = 0;
          amt -= a;
          pBuf += a;
        }
      }
    }
  }

  /* Bytes still outstanding with no error means the loop stopped because
  ** the overflow chain ended early (nextPage==0): report corruption. */
  if( rc==SQLITE_OK && amt>0 ){
    return SQLITE_CORRUPT_BKPT;
  }
  return rc;
}
1.3312 +
1.3313 +/*
1.3314 +** Read part of the key associated with cursor pCur. Exactly
1.3315 +** "amt" bytes will be transfered into pBuf[]. The transfer
1.3316 +** begins at "offset".
1.3317 +**
1.3318 +** Return SQLITE_OK on success or an error code if anything goes
1.3319 +** wrong. An error is returned if "offset+amt" is larger than
1.3320 +** the available payload.
1.3321 +*/
1.3322 +int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1.3323 + int rc;
1.3324 +
1.3325 + assert( cursorHoldsMutex(pCur) );
1.3326 + rc = restoreCursorPosition(pCur);
1.3327 + if( rc==SQLITE_OK ){
1.3328 + assert( pCur->eState==CURSOR_VALID );
1.3329 + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
1.3330 + if( pCur->apPage[0]->intKey ){
1.3331 + return SQLITE_CORRUPT_BKPT;
1.3332 + }
1.3333 + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3334 + rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
1.3335 + }
1.3336 + return rc;
1.3337 +}
1.3338 +
1.3339 +/*
1.3340 +** Read part of the data associated with cursor pCur. Exactly
1.3341 +** "amt" bytes will be transfered into pBuf[]. The transfer
1.3342 +** begins at "offset".
1.3343 +**
1.3344 +** Return SQLITE_OK on success or an error code if anything goes
1.3345 +** wrong. An error is returned if "offset+amt" is larger than
1.3346 +** the available payload.
1.3347 +*/
1.3348 +int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
1.3349 + int rc;
1.3350 +
1.3351 +#ifndef SQLITE_OMIT_INCRBLOB
1.3352 + if ( pCur->eState==CURSOR_INVALID ){
1.3353 + return SQLITE_ABORT;
1.3354 + }
1.3355 +#endif
1.3356 +
1.3357 + assert( cursorHoldsMutex(pCur) );
1.3358 + rc = restoreCursorPosition(pCur);
1.3359 + if( rc==SQLITE_OK ){
1.3360 + assert( pCur->eState==CURSOR_VALID );
1.3361 + assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
1.3362 + assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3363 + rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
1.3364 + }
1.3365 + return rc;
1.3366 +}
1.3367 +
1.3368 +/*
1.3369 +** Return a pointer to payload information from the entry that the
1.3370 +** pCur cursor is pointing to. The pointer is to the beginning of
1.3371 +** the key if skipKey==0 and it points to the beginning of data if
1.3372 +** skipKey==1. The number of bytes of available key/data is written
1.3373 +** into *pAmt. If *pAmt==0, then the value returned will not be
1.3374 +** a valid pointer.
1.3375 +**
1.3376 +** This routine is an optimization. It is common for the entire key
1.3377 +** and data to fit on the local page and for there to be no overflow
1.3378 +** pages. When that is so, this routine can be used to access the
1.3379 +** key and data without making a copy. If the key and/or data spills
1.3380 +** onto overflow pages, then accessPayload() must be used to reassembly
1.3381 +** the key/data and copy it into a preallocated buffer.
1.3382 +**
1.3383 +** The pointer returned by this routine looks directly into the cached
1.3384 +** page of the database. The data might change or move the next time
1.3385 +** any btree routine is called.
1.3386 +*/
1.3387 +static const unsigned char *fetchPayload(
1.3388 + BtCursor *pCur, /* Cursor pointing to entry to read from */
1.3389 + int *pAmt, /* Write the number of available bytes here */
1.3390 + int skipKey /* read beginning at data if this is true */
1.3391 +){
1.3392 + unsigned char *aPayload;
1.3393 + MemPage *pPage;
1.3394 + u32 nKey;
1.3395 + int nLocal;
1.3396 +
1.3397 + assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
1.3398 + assert( pCur->eState==CURSOR_VALID );
1.3399 + assert( cursorHoldsMutex(pCur) );
1.3400 + pPage = pCur->apPage[pCur->iPage];
1.3401 + assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
1.3402 + getCellInfo(pCur);
1.3403 + aPayload = pCur->info.pCell;
1.3404 + aPayload += pCur->info.nHeader;
1.3405 + if( pPage->intKey ){
1.3406 + nKey = 0;
1.3407 + }else{
1.3408 + nKey = pCur->info.nKey;
1.3409 + }
1.3410 + if( skipKey ){
1.3411 + aPayload += nKey;
1.3412 + nLocal = pCur->info.nLocal - nKey;
1.3413 + }else{
1.3414 + nLocal = pCur->info.nLocal;
1.3415 + if( nLocal>nKey ){
1.3416 + nLocal = nKey;
1.3417 + }
1.3418 + }
1.3419 + *pAmt = nLocal;
1.3420 + return aPayload;
1.3421 +}
1.3422 +
1.3423 +
1.3424 +/*
1.3425 +** For the entry that cursor pCur is point to, return as
1.3426 +** many bytes of the key or data as are available on the local
1.3427 +** b-tree page. Write the number of available bytes into *pAmt.
1.3428 +**
1.3429 +** The pointer returned is ephemeral. The key/data may move
1.3430 +** or be destroyed on the next call to any Btree routine,
1.3431 +** including calls from other threads against the same cache.
1.3432 +** Hence, a mutex on the BtShared should be held prior to calling
1.3433 +** this routine.
1.3434 +**
1.3435 +** These routines is used to get quick access to key and data
1.3436 +** in the common case where no overflow pages are used.
1.3437 +*/
1.3438 +const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
1.3439 + assert( cursorHoldsMutex(pCur) );
1.3440 + if( pCur->eState==CURSOR_VALID ){
1.3441 + return (const void*)fetchPayload(pCur, pAmt, 0);
1.3442 + }
1.3443 + return 0;
1.3444 +}
1.3445 +const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
1.3446 + assert( cursorHoldsMutex(pCur) );
1.3447 + if( pCur->eState==CURSOR_VALID ){
1.3448 + return (const void*)fetchPayload(pCur, pAmt, 1);
1.3449 + }
1.3450 + return 0;
1.3451 +}
1.3452 +
1.3453 +
1.3454 +/*
1.3455 +** Move the cursor down to a new child page. The newPgno argument is the
1.3456 +** page number of the child page to move to.
1.3457 +*/
1.3458 +static int moveToChild(BtCursor *pCur, u32 newPgno){
1.3459 + int rc;
1.3460 + int i = pCur->iPage;
1.3461 + MemPage *pNewPage;
1.3462 + BtShared *pBt = pCur->pBt;
1.3463 +
1.3464 + assert( cursorHoldsMutex(pCur) );
1.3465 + assert( pCur->eState==CURSOR_VALID );
1.3466 + assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
1.3467 + if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
1.3468 + return SQLITE_CORRUPT_BKPT;
1.3469 + }
1.3470 + rc = getAndInitPage(pBt, newPgno, &pNewPage);
1.3471 + if( rc ) return rc;
1.3472 + pCur->apPage[i+1] = pNewPage;
1.3473 + pCur->aiIdx[i+1] = 0;
1.3474 + pCur->iPage++;
1.3475 +
1.3476 + pCur->info.nSize = 0;
1.3477 + pCur->validNKey = 0;
1.3478 + if( pNewPage->nCell<1 ){
1.3479 + return SQLITE_CORRUPT_BKPT;
1.3480 + }
1.3481 + return SQLITE_OK;
1.3482 +}
1.3483 +
1.3484 +#ifndef NDEBUG
1.3485 +/*
1.3486 +** Page pParent is an internal (non-leaf) tree page. This function
1.3487 +** asserts that page number iChild is the left-child if the iIdx'th
1.3488 +** cell in page pParent. Or, if iIdx is equal to the total number of
1.3489 +** cells in pParent, that page number iChild is the right-child of
1.3490 +** the page.
1.3491 +*/
1.3492 +static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
1.3493 + assert( iIdx<=pParent->nCell );
1.3494 + if( iIdx==pParent->nCell ){
1.3495 + assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
1.3496 + }else{
1.3497 + assert( get4byte(findCell(pParent, iIdx))==iChild );
1.3498 + }
1.3499 +}
1.3500 +#else
1.3501 +# define assertParentIndex(x,y,z)
1.3502 +#endif
1.3503 +
1.3504 +/*
1.3505 +** Move the cursor up to the parent page.
1.3506 +**
1.3507 +** After the move, pCur->aiIdx[pCur->iPage] is the index of the cell
1.3508 +** that contains the pointer to the page we came from. If we came from
1.3509 +** the right-most child page then that index is one more than
1.3510 +** the largest cell index.
1.3511 +*/
1.3512 +void sqlite3BtreeMoveToParent(BtCursor *pCur){
1.3513 +  int iChild;  /* Page-stack index of the page being left */
1.3514 +  assert( cursorHoldsMutex(pCur) );
1.3515 +  assert( pCur->eState==CURSOR_VALID );
1.3516 +  iChild = pCur->iPage;
1.3517 +  assert( iChild>0 );
1.3518 +  assert( pCur->apPage[iChild] );
1.3519 +  assertParentIndex(pCur->apPage[iChild-1], pCur->aiIdx[iChild-1],
1.3520 +                    pCur->apPage[iChild]->pgno);
1.3521 +  /* The cached cell info described an entry on the child page. */
1.3522 +  pCur->info.nSize = 0;
1.3523 +  pCur->validNKey = 0;
1.3524 +  releasePage(pCur->apPage[iChild]);
1.3525 +  pCur->iPage = iChild-1;
1.3526 +}
1.3527 +
1.3528 +/*
1.3529 +** Move the cursor to the root page
1.3530 +*/
1.3531 +static int moveToRoot(BtCursor *pCur){
1.3532 + MemPage *pRoot; /* The root page, always held in apPage[0] */
1.3533 + int rc = SQLITE_OK;
1.3534 + Btree *p = pCur->pBtree;
1.3535 + BtShared *pBt = p->pBt;
1.3536 + /* Returns SQLITE_OK or an error code; eState is updated either way. */
1.3537 + assert( cursorHoldsMutex(pCur) );
1.3538 + assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
1.3539 + assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
1.3540 + assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
1.3541 + if( pCur->eState>=CURSOR_REQUIRESEEK ){ /* REQUIRESEEK or FAULT (see asserts) */
1.3542 + if( pCur->eState==CURSOR_FAULT ){
1.3543 + return pCur->skip; /* presumably the saved error code - confirm */
1.3544 + }
1.3545 + sqlite3BtreeClearCursor(pCur);
1.3546 + }
1.3547 +
1.3548 + if( pCur->iPage>=0 ){
1.3549 + int i;
1.3550 + for(i=1; i<=pCur->iPage; i++){ /* Release everything above the root */
1.3551 + releasePage(pCur->apPage[i]);
1.3552 + }
1.3553 + }else{
1.3554 + if( /* No page is held yet: load the root into slot 0 */
1.3555 + SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
1.3556 + ){
1.3557 + pCur->eState = CURSOR_INVALID;
1.3558 + return rc;
1.3559 + }
1.3560 + }
1.3561 + /* Reset the cursor to the first cell of the root page. */
1.3562 + pRoot = pCur->apPage[0];
1.3563 + assert( pRoot->pgno==pCur->pgnoRoot );
1.3564 + pCur->iPage = 0;
1.3565 + pCur->aiIdx[0] = 0;
1.3566 + pCur->info.nSize = 0;
1.3567 + pCur->atLast = 0;
1.3568 + pCur->validNKey = 0;
1.3569 +
1.3570 + if( pRoot->nCell==0 && !pRoot->leaf ){
1.3571 + Pgno subpage; /* Right-child of the cell-less interior root */
1.3572 + assert( pRoot->pgno==1 ); /* Only expected for page 1 */
1.3573 + subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
1.3574 + assert( subpage>0 );
1.3575 + pCur->eState = CURSOR_VALID; /* moveToChild() asserts a valid cursor */
1.3576 + rc = moveToChild(pCur, subpage);
1.3577 + }else{
1.3578 + pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
1.3579 + }
1.3580 + return rc;
1.3581 +}
1.3582 +
1.3583 +/*
1.3584 +** Move the cursor down to the left-most leaf entry beneath the
1.3585 +** entry to which it is currently pointing.
1.3586 +**
1.3587 +** The left-most leaf is the one with the smallest key - the first
1.3588 +** in ascending order.
1.3589 +*/
1.3590 +static int moveToLeftmost(BtCursor *pCur){
1.3591 +  MemPage *pPage;       /* Page at the top of the cursor's page stack */
1.3592 +  int rc = SQLITE_OK;   /* Result of the most recent descent */
1.3593 +  assert( cursorHoldsMutex(pCur) );
1.3594 +  assert( pCur->eState==CURSOR_VALID );
1.3595 +  for(;;){
1.3596 +    pPage = pCur->apPage[pCur->iPage];
1.3597 +    if( pPage->leaf ) break;   /* Reached a leaf: nothing further down */
1.3598 +    assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
1.3599 +    rc = moveToChild(pCur, get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])));
1.3600 +    if( rc!=SQLITE_OK ) break;
1.3601 +  }
1.3602 +  return rc;
1.3603 +}
1.3604 +
1.3605 +/*
1.3606 +** Move the cursor down to the right-most leaf entry beneath the
1.3607 +** page to which it is currently pointing. Notice the difference
1.3608 +** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
1.3609 +** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1.3610 +** finds the right-most entry beneath the *page*.
1.3611 +**
1.3612 +** The right-most entry is the one with the largest key - the last
1.3613 +** key in ascending order.
1.3614 +*/
1.3615 +static int moveToRightmost(BtCursor *pCur){
1.3616 +  MemPage *pPage;       /* Page at the top of the cursor's page stack */
1.3617 +  int rc = SQLITE_OK;   /* Result of the most recent descent */
1.3618 +  assert( cursorHoldsMutex(pCur) );
1.3619 +  assert( pCur->eState==CURSOR_VALID );
1.3620 +  for(;;){
1.3621 +    pPage = pCur->apPage[pCur->iPage];
1.3622 +    if( pPage->leaf ){
1.3623 +      pCur->aiIdx[pCur->iPage] = pPage->nCell-1;  /* Last cell of the leaf */
1.3624 +      pCur->info.nSize = 0;
1.3625 +      pCur->validNKey = 0;
1.3626 +      break;
1.3627 +    }
1.3628 +    pCur->aiIdx[pCur->iPage] = pPage->nCell;  /* Index meaning "right-child" */
1.3629 +    rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
1.3630 +    if( rc!=SQLITE_OK ) break;
1.3631 +  }
1.3632 +  return rc;
1.3633 +}
1.3634 +
1.3635 +/* Move the cursor to the first entry in the table. Return SQLITE_OK
1.3636 +** on success. Set *pRes to 0 if the cursor actually points to something
1.3637 +** or set *pRes to 1 if the table is empty.
1.3638 +*/
1.3639 +int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
1.3640 +  int rc;
1.3641 +
1.3642 +  assert( cursorHoldsMutex(pCur) );
1.3643 +  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3644 +  rc = moveToRoot(pCur);
1.3645 +  if( rc!=SQLITE_OK ){
1.3646 +    return rc;  /* *pRes is left untouched on error */
1.3647 +  }
1.3648 +  if( pCur->eState==CURSOR_INVALID ){
1.3649 +    /* Empty table: there is no first entry. */
1.3650 +    assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3651 +    *pRes = 1;
1.3652 +    return SQLITE_OK;
1.3653 +  }
1.3654 +  assert( pCur->apPage[pCur->iPage]->nCell>0 );
1.3655 +  *pRes = 0;
1.3656 +  return moveToLeftmost(pCur);
1.3657 +}
1.3658 +
1.3659 +/* Move the cursor to the last entry in the table. Return SQLITE_OK
1.3660 +** on success. Set *pRes to 0 if the cursor actually points to something
1.3661 +** or set *pRes to 1 if the table is empty.
1.3662 +*/
1.3663 +int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
1.3664 +  int rc;  /* SQLITE_OK or the error from moveToRoot()/moveToRightmost() */
1.3665 +  /* *pRes is written only when moveToRoot() succeeds. */
1.3666 +  assert( cursorHoldsMutex(pCur) );
1.3667 +  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3668 +  rc = moveToRoot(pCur);
1.3669 +  if( rc==SQLITE_OK ){
1.3670 +    if( CURSOR_INVALID==pCur->eState ){
1.3671 +      assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3672 +      *pRes = 1;  /* Empty table */
1.3673 +    }else{
1.3674 +      assert( pCur->eState==CURSOR_VALID );
1.3675 +      *pRes = 0;
1.3676 +      rc = moveToRightmost(pCur);
1.3677 +      if( rc==SQLITE_OK ){ getCellInfo(pCur); }  /* No valid cell after a failed descent */
1.3678 +      pCur->atLast = rc==SQLITE_OK;  /* Flag is set only after a clean descent */
1.3679 +    }
1.3680 +  }
1.3681 +  return rc;
1.3682 +}
1.3683 +
1.3684 +/* Move the cursor so that it points to an entry near the key
1.3685 +** specified by pIdxKey or intKey. Return a success code.
1.3686 +**
1.3687 +** For INTKEY tables, the intKey parameter is used. pIdxKey
1.3688 +** must be NULL. For index tables, pIdxKey is used and intKey
1.3689 +** is ignored.
1.3690 +**
1.3691 +** If an exact match is not found, then the cursor is always
1.3692 +** left pointing at a leaf page which would hold the entry if it
1.3693 +** were present. The cursor might point to an entry that comes
1.3694 +** before or after the key.
1.3695 +**
1.3696 +** The result of comparing the key with the entry to which the
1.3697 +** cursor points is written to *pRes if pRes!=NULL. The meaning of
1.3698 +** this value is as follows:
1.3699 +**
1.3700 +** *pRes<0 The cursor is left pointing at an entry that
1.3701 +** is smaller than pKey or if the table is empty
1.3702 +** and the cursor is therefore left pointing at nothing.
1.3703 +**
1.3704 +** *pRes==0 The cursor is left pointing at an entry that
1.3705 +** exactly matches pKey.
1.3706 +**
1.3707 +** *pRes>0 The cursor is left pointing at an entry that
1.3708 +** is larger than pKey.
1.3709 +**
1.3710 +*/
1.3711 +int sqlite3BtreeMovetoUnpacked(
1.3712 +  BtCursor *pCur, /* The cursor to be moved */
1.3713 +  UnpackedRecord *pIdxKey, /* Unpacked index key */
1.3714 +  i64 intKey, /* The table key */
1.3715 +  int biasRight, /* If true, bias the search to the high end */
1.3716 +  int *pRes /* Write search results here. May be NULL */
1.3717 +){
1.3718 +  int rc;  /* SQLITE_OK, or the error that stopped the search */
1.3719 +
1.3720 +  assert( cursorHoldsMutex(pCur) );
1.3721 +  assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.3722 +
1.3723 +  /* If the cursor is already positioned at the point we are trying
1.3724 +  ** to move to, then just return without doing any work */
1.3725 +  if( pCur->eState==CURSOR_VALID && pCur->validNKey
1.3726 +   && pCur->apPage[0]->intKey
1.3727 +  ){
1.3728 +    if( pCur->info.nKey==intKey ){
1.3729 +      if( pRes ) *pRes = 0;
1.3730 +      return SQLITE_OK;
1.3731 +    }
1.3732 +    if( pCur->atLast && pCur->info.nKey<intKey ){
1.3733 +      if( pRes ) *pRes = -1;  /* Already on the last entry and it is smaller */
1.3734 +      return SQLITE_OK;
1.3735 +    }
1.3736 +  }
1.3737 +
1.3738 +  rc = moveToRoot(pCur);
1.3739 +  if( rc ){
1.3740 +    return rc;
1.3741 +  }
1.3742 +  assert( pCur->apPage[pCur->iPage] );
1.3743 +  assert( pCur->apPage[pCur->iPage]->isInit );
1.3744 +  if( pCur->eState==CURSOR_INVALID ){
1.3745 +    if( pRes ) *pRes = -1;  /* Empty table */
1.3746 +    assert( pCur->apPage[pCur->iPage]->nCell==0 );
1.3747 +    return SQLITE_OK;
1.3748 +  }
1.3749 +  assert( pCur->apPage[0]->intKey || pIdxKey );
1.3750 +  for(;;){  /* One iteration per tree level */
1.3751 +    int lwr, upr;  /* Bounds of the binary search over cell indices */
1.3752 +    Pgno chldPg;   /* Child page to descend into, or 0 on a leaf */
1.3753 +    MemPage *pPage = pCur->apPage[pCur->iPage];
1.3754 +    int c = -1; /* pRes return if table is empty must be -1 */
1.3755 +    lwr = 0;
1.3756 +    upr = pPage->nCell-1;
1.3757 +    if( !pPage->intKey && pIdxKey==0 ){
1.3758 +      rc = SQLITE_CORRUPT_BKPT;  /* Index page reached without an index key */
1.3759 +      goto moveto_finish;
1.3760 +    }
1.3761 +    if( biasRight ){
1.3762 +      pCur->aiIdx[pCur->iPage] = upr;  /* First probe is the last cell */
1.3763 +    }else{
1.3764 +      pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
1.3765 +    }
1.3766 +    if( lwr<=upr ) for(;;){
1.3767 +      void *pCellKey;
1.3768 +      i64 nCellKey;
1.3769 +      int idx = pCur->aiIdx[pCur->iPage];
1.3770 +      pCur->info.nSize = 0;
1.3771 +      pCur->validNKey = 1;
1.3772 +      if( pPage->intKey ){
1.3773 +        u8 *pCell;
1.3774 +        pCell = findCell(pPage, idx) + pPage->childPtrSize;
1.3775 +        if( pPage->hasData ){
1.3776 +          u32 dummy;
1.3777 +          pCell += getVarint32(pCell, dummy);  /* Step over the first varint */
1.3778 +        }
1.3779 +        getVarint(pCell, (u64*)&nCellKey);
1.3780 +        if( nCellKey==intKey ){
1.3781 +          c = 0;
1.3782 +        }else if( nCellKey<intKey ){
1.3783 +          c = -1;
1.3784 +        }else{
1.3785 +          assert( nCellKey>intKey );
1.3786 +          c = +1;
1.3787 +        }
1.3788 +      }else{
1.3789 +        int available;
1.3790 +        pCellKey = (void *)fetchPayload(pCur, &available, 0);
1.3791 +        nCellKey = pCur->info.nKey;
1.3792 +        if( available>=nCellKey ){
1.3793 +          c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
1.3794 +        }else{  /* Key not fully available in place: copy it out first */
1.3795 +          pCellKey = sqlite3Malloc( nCellKey );
1.3796 +          if( pCellKey==0 ){
1.3797 +            rc = SQLITE_NOMEM;
1.3798 +            goto moveto_finish;
1.3799 +          }
1.3800 +          rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
1.3801 +          if( rc==SQLITE_OK ) c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
1.3802 +          sqlite3_free(pCellKey);
1.3803 +          if( rc ) goto moveto_finish;  /* Read failed: buffer was never compared */
1.3804 +        }
1.3805 +      }
1.3806 +      if( c==0 ){
1.3807 +        pCur->info.nKey = nCellKey;
1.3808 +        if( pPage->intKey && !pPage->leaf ){
1.3809 +          lwr = idx;  /* Match on an interior intKey page: descend via this cell */
1.3810 +          upr = lwr - 1;
1.3811 +          break;
1.3812 +        }else{
1.3813 +          if( pRes ) *pRes = 0;
1.3814 +          rc = SQLITE_OK;
1.3815 +          goto moveto_finish;
1.3816 +        }
1.3817 +      }
1.3818 +      if( c<0 ){
1.3819 +        lwr = idx+1;
1.3820 +      }else{
1.3821 +        upr = idx-1;
1.3822 +      }
1.3823 +      if( lwr>upr ){
1.3824 +        pCur->info.nKey = nCellKey;
1.3825 +        break;
1.3826 +      }
1.3827 +      pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
1.3828 +    }
1.3829 +    assert( lwr==upr+1 );
1.3830 +    assert( pPage->isInit );
1.3831 +    if( pPage->leaf ){
1.3832 +      chldPg = 0;
1.3833 +    }else if( lwr>=pPage->nCell ){
1.3834 +      chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);  /* Right-child */
1.3835 +    }else{
1.3836 +      chldPg = get4byte(findCell(pPage, lwr));  /* Left-child of cell lwr */
1.3837 +    }
1.3838 +    if( chldPg==0 ){
1.3839 +      assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
1.3840 +      if( pRes ) *pRes = c;  /* Inexact match: report the last comparison */
1.3841 +      rc = SQLITE_OK;
1.3842 +      goto moveto_finish;
1.3843 +    }
1.3844 +    pCur->aiIdx[pCur->iPage] = lwr;
1.3845 +    pCur->info.nSize = 0;
1.3846 +    pCur->validNKey = 0;
1.3847 +    rc = moveToChild(pCur, chldPg);
1.3848 +    if( rc ) goto moveto_finish;
1.3849 +  }
1.3850 +moveto_finish:
1.3851 +  return rc;
1.3852 +}
1.3853 +
1.3854 +/*
1.3855 +** In this version of BtreeMoveto, pKey is a packed index record
1.3856 +** such as is generated by the OP_MakeRecord opcode. Unpack the
1.3857 +** record and then call BtreeMovetoUnpacked() to do the work.
1.3858 +*/
1.3859 +int sqlite3BtreeMoveto(
1.3860 +  BtCursor *pCur, /* Cursor open on the btree to be searched */
1.3861 +  const void *pKey, /* Packed key if the btree is an index */
1.3862 +  i64 nKey, /* Integer key for tables. Size of pKey for indices */
1.3863 +  int bias, /* Bias search to the high end */
1.3864 +  int *pRes /* Write search results here */
1.3865 +){
1.3866 +  UnpackedRecord aSpace[16];    /* Scratch space handed to the unpacker */
1.3867 +  UnpackedRecord *pIdxKey = 0;  /* Unpacked key; stays NULL when pKey is NULL */
1.3868 +  int rc;                       /* Status code */
1.3869 +
1.3870 +  if( pKey!=0 ){
1.3871 +    pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
1.3872 +                                      aSpace, sizeof(aSpace));
1.3873 +    if( pIdxKey==0 ){
1.3874 +      return SQLITE_NOMEM;
1.3875 +    }
1.3876 +  }
1.3877 +  rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
1.3878 +  if( pIdxKey!=0 ){
1.3879 +    sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
1.3880 +  }
1.3881 +  return rc;
1.3882 +}
1.3883 +
1.3884 +
1.3885 +/*
1.3886 +** Return TRUE if the cursor is not pointing at an entry of the table.
1.3887 +**
1.3888 +** TRUE will be returned after a call to sqlite3BtreeNext() moves
1.3889 +** past the last entry in the table or sqlite3BtreePrevious() moves past
1.3890 +** the first entry. TRUE is also returned if the table is empty.
1.3891 +*/
1.3892 +int sqlite3BtreeEof(BtCursor *pCur){
1.3893 +  /* TODO: Unresolved case - the cursor is in CURSOR_REQUIRESEEK state but
1.3894 +  ** every table entry has since been deleted. Handling it would mean
1.3895 +  ** returning an error code in addition to the boolean result.
1.3896 +  */
1.3897 +  return pCur->eState!=CURSOR_VALID;
1.3898 +}
1.3899 +
1.3900 +/* Return the database connection handle for a cursor. The mutex of
1.3901 +** that connection is asserted to be held. */
1.3902 +sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
1.3903 +  sqlite3 *db = pCur->pBtree->db;
1.3904 +  assert( sqlite3_mutex_held(db->mutex) );
1.3905 +  return db;
1.3906 +}
1.3907 +
1.3908 +/*
1.3909 +** Advance the cursor to the next entry in the database. If
1.3910 +** successful then set *pRes=0. If the cursor
1.3911 +** was already pointing to the last entry in the database before
1.3912 +** this routine was called, then set *pRes=1.
1.3913 +*/
1.3914 +int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
1.3915 + int rc;
1.3916 + int idx; /* Cell index on the current page after the increment */
1.3917 + MemPage *pPage; /* Page at the top of the cursor's page stack */
1.3918 +
1.3919 + assert( cursorHoldsMutex(pCur) );
1.3920 + rc = restoreCursorPosition(pCur);
1.3921 + if( rc!=SQLITE_OK ){
1.3922 + return rc;
1.3923 + }
1.3924 + assert( pRes!=0 );
1.3925 + if( CURSOR_INVALID==pCur->eState ){
1.3926 + *pRes = 1; /* Nothing to advance from */
1.3927 + return SQLITE_OK;
1.3928 + }
1.3929 + if( pCur->skip>0 ){ /* presumably set by restoreCursorPosition() - confirm */
1.3930 + pCur->skip = 0;
1.3931 + *pRes = 0; /* Report success without moving the cursor */
1.3932 + return SQLITE_OK;
1.3933 + }
1.3934 + pCur->skip = 0;
1.3935 +
1.3936 + pPage = pCur->apPage[pCur->iPage];
1.3937 + idx = ++pCur->aiIdx[pCur->iPage];
1.3938 + assert( pPage->isInit );
1.3939 + assert( idx<=pPage->nCell );
1.3940 +
1.3941 + pCur->info.nSize = 0;
1.3942 + pCur->validNKey = 0;
1.3943 + if( idx>=pPage->nCell ){ /* Stepped past the last cell of this page */
1.3944 + if( !pPage->leaf ){
1.3945 + rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
1.3946 + if( rc ) return rc;
1.3947 + rc = moveToLeftmost(pCur); /* Smallest entry beneath the right-child */
1.3948 + *pRes = 0;
1.3949 + return rc;
1.3950 + }
1.3951 + do{ /* Leaf exhausted: climb until a parent still has a cell to visit */
1.3952 + if( pCur->iPage==0 ){
1.3953 + *pRes = 1; /* Ran off the end of the root */
1.3954 + pCur->eState = CURSOR_INVALID;
1.3955 + return SQLITE_OK;
1.3956 + }
1.3957 + sqlite3BtreeMoveToParent(pCur);
1.3958 + pPage = pCur->apPage[pCur->iPage];
1.3959 + }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
1.3960 + *pRes = 0;
1.3961 + if( pPage->intKey ){
1.3962 + rc = sqlite3BtreeNext(pCur, pRes); /* intKey parent cell is stepped over */
1.3963 + }else{
1.3964 + rc = SQLITE_OK; /* Otherwise the parent cell is itself the next entry */
1.3965 + }
1.3966 + return rc;
1.3967 + }
1.3968 + *pRes = 0;
1.3969 + if( pPage->leaf ){
1.3970 + return SQLITE_OK;
1.3971 + }
1.3972 + rc = moveToLeftmost(pCur); /* Interior page: descend beneath cell idx */
1.3973 + return rc;
1.3974 +}
1.3975 +
1.3976 +
1.3977 +/*
1.3978 +** Step the cursor back to the previous entry in the database. If
1.3979 +** successful then set *pRes=0. If the cursor
1.3980 +** was already pointing to the first entry in the database before
1.3981 +** this routine was called, then set *pRes=1.
1.3982 +*/
1.3983 +int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
1.3984 + int rc;
1.3985 + MemPage *pPage; /* Page at the top of the cursor's page stack */
1.3986 +
1.3987 + assert( cursorHoldsMutex(pCur) );
1.3988 + rc = restoreCursorPosition(pCur);
1.3989 + if( rc!=SQLITE_OK ){
1.3990 + return rc;
1.3991 + }
1.3992 + pCur->atLast = 0; /* Cleared before any backward step */
1.3993 + if( CURSOR_INVALID==pCur->eState ){
1.3994 + *pRes = 1; /* Nothing to step back from */
1.3995 + return SQLITE_OK;
1.3996 + }
1.3997 + if( pCur->skip<0 ){ /* presumably set by restoreCursorPosition() - confirm */
1.3998 + pCur->skip = 0;
1.3999 + *pRes = 0; /* Report success without moving the cursor */
1.4000 + return SQLITE_OK;
1.4001 + }
1.4002 + pCur->skip = 0;
1.4003 +
1.4004 + pPage = pCur->apPage[pCur->iPage];
1.4005 + assert( pPage->isInit );
1.4006 + if( !pPage->leaf ){
1.4007 + int idx = pCur->aiIdx[pCur->iPage];
1.4008 + rc = moveToChild(pCur, get4byte(findCell(pPage, idx))); /* Left-child of cell idx */
1.4009 + if( rc ){
1.4010 + return rc;
1.4011 + }
1.4012 + rc = moveToRightmost(pCur); /* Largest entry beneath that child */
1.4013 + }else{
1.4014 + while( pCur->aiIdx[pCur->iPage]==0 ){ /* At first cell: climb to the parent */
1.4015 + if( pCur->iPage==0 ){
1.4016 + pCur->eState = CURSOR_INVALID;
1.4017 + *pRes = 1; /* Stepped before the first entry */
1.4018 + return SQLITE_OK;
1.4019 + }
1.4020 + sqlite3BtreeMoveToParent(pCur);
1.4021 + }
1.4022 + pCur->info.nSize = 0;
1.4023 + pCur->validNKey = 0;
1.4024 +
1.4025 + pCur->aiIdx[pCur->iPage]--;
1.4026 + pPage = pCur->apPage[pCur->iPage];
1.4027 + if( pPage->intKey && !pPage->leaf ){
1.4028 + rc = sqlite3BtreePrevious(pCur, pRes); /* Interior intKey cell is stepped over */
1.4029 + }else{
1.4030 + rc = SQLITE_OK;
1.4031 + }
1.4032 + }
1.4033 + *pRes = 0; /* Reached by every path that did not return above */
1.4034 + return rc;
1.4035 +}
1.4036 +
1.4037 +/*
1.4038 +** Allocate a new page from the database file.
1.4039 +**
1.4040 +** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
1.4041 +** has already been called on the new page.) The new page has also
1.4042 +** been referenced and the calling routine is responsible for calling
1.4043 +** sqlite3PagerUnref() on the new page when it is done.
1.4044 +**
1.4045 +** SQLITE_OK is returned on success. Any other return value indicates
1.4046 +** an error. *ppPage and *pPgno are undefined in the event of an error.
1.4047 +** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
1.4048 +**
1.4049 +** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1.4050 +** locate a page close to the page number "nearby". This can be used in an
1.4051 +** attempt to keep related pages close to each other in the database file,
1.4052 +** which in turn can make database access faster.
1.4053 +**
1.4054 +** If the "exact" parameter is not 0, and the page-number nearby exists
1.4055 +** anywhere on the free-list, then it is guaranteed to be returned. This
1.4056 +** is only used by auto-vacuum databases when allocating a new table.
1.4057 +*/
1.4058 +static int allocateBtreePage(
1.4059 + BtShared *pBt, /* Shared btree in which the page is allocated */
1.4060 + MemPage **ppPage, /* OUT: the newly allocated page */
1.4061 + Pgno *pPgno, /* OUT: page number of the new page */
1.4062 + Pgno nearby, /* If not 0, try to allocate close to this page */
1.4063 + u8 exact /* If not 0, 'nearby' itself is wanted when it is free */
1.4064 +){
1.4065 + MemPage *pPage1;
1.4066 + int rc;
1.4067 + int n; /* Number of pages on the freelist */
1.4068 + int k; /* Number of leaves on the trunk of the freelist */
1.4069 + MemPage *pTrunk = 0; /* Trunk page currently being examined */
1.4070 + MemPage *pPrevTrunk = 0; /* Trunk examined on the previous loop pass */
1.4071 +
1.4072 + assert( sqlite3_mutex_held(pBt->mutex) );
1.4073 + pPage1 = pBt->pPage1;
1.4074 + n = get4byte(&pPage1->aData[36]); /* Free-page count lives at byte 36 of page 1 */
1.4075 + if( n>0 ){
1.4076 + /* There are pages on the freelist. Reuse one of those pages. */
1.4077 + Pgno iTrunk;
1.4078 + u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
1.4079 +
1.4080 + /* If the 'exact' parameter was true and a query of the pointer-map
1.4081 + ** shows that the page 'nearby' is somewhere on the free-list, then
1.4082 + ** the entire-list will be searched for that page.
1.4083 + */
1.4084 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4085 + if( exact && nearby<=pagerPagecount(pBt->pPager) ){
1.4086 + u8 eType;
1.4087 + assert( nearby>0 );
1.4088 + assert( pBt->autoVacuum );
1.4089 + rc = ptrmapGet(pBt, nearby, &eType, 0);
1.4090 + if( rc ) return rc;
1.4091 + if( eType==PTRMAP_FREEPAGE ){
1.4092 + searchList = 1;
1.4093 + }
1.4094 + *pPgno = nearby;
1.4095 + }
1.4096 +#endif
1.4097 +
1.4098 + /* Decrement the free-list count by 1. The number of the first
1.4099 + ** free-list trunk page is read from byte 32 of page 1 by the loop below.
1.4100 + */
1.4101 + rc = sqlite3PagerWrite(pPage1->pDbPage);
1.4102 + if( rc ) return rc;
1.4103 + put4byte(&pPage1->aData[36], n-1);
1.4104 +
1.4105 + /* The code within this loop is run only once if the 'searchList' variable
1.4106 + ** is not true. Otherwise, it runs once for each trunk-page on the
1.4107 + ** free-list until the page 'nearby' is located.
1.4108 + */
1.4109 + do {
1.4110 + pPrevTrunk = pTrunk;
1.4111 + if( pPrevTrunk ){
1.4112 + iTrunk = get4byte(&pPrevTrunk->aData[0]); /* Next-trunk pointer */
1.4113 + }else{
1.4114 + iTrunk = get4byte(&pPage1->aData[32]); /* First trunk, from page 1 */
1.4115 + }
1.4116 + rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
1.4117 + if( rc ){
1.4118 + pTrunk = 0; /* So the cleanup code does not release it */
1.4119 + goto end_allocate_page;
1.4120 + }
1.4121 +
1.4122 + k = get4byte(&pTrunk->aData[4]);
1.4123 + if( k==0 && !searchList ){
1.4124 + /* The trunk has no leaves and the list is not being searched.
1.4125 + ** So extract the trunk page itself and use it as the newly
1.4126 + ** allocated page */
1.4127 + assert( pPrevTrunk==0 );
1.4128 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4129 + if( rc ){
1.4130 + goto end_allocate_page;
1.4131 + }
1.4132 + *pPgno = iTrunk;
1.4133 + memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
1.4134 + *ppPage = pTrunk;
1.4135 + pTrunk = 0; /* Ownership moved to *ppPage */
1.4136 + TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
1.4137 + }else if( k>pBt->usableSize/4 - 2 ){
1.4138 + /* Value of k is out of range. Database corruption */
1.4139 + rc = SQLITE_CORRUPT_BKPT;
1.4140 + goto end_allocate_page;
1.4141 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4142 + }else if( searchList && nearby==iTrunk ){
1.4143 + /* The list is being searched and this trunk page is the page
1.4144 + ** to allocate, regardless of whether it has leaves.
1.4145 + */
1.4146 + assert( *pPgno==iTrunk );
1.4147 + *ppPage = pTrunk;
1.4148 + searchList = 0;
1.4149 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4150 + if( rc ){
1.4151 + goto end_allocate_page;
1.4152 + }
1.4153 + if( k==0 ){
1.4154 + if( !pPrevTrunk ){
1.4155 + memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
1.4156 + }else{
1.4157 + memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
1.4158 + }
1.4159 + }else{
1.4160 + /* The trunk page is required by the caller but it contains
1.4161 + ** pointers to free-list leaves. The first leaf becomes a trunk
1.4162 + ** page in this case.
1.4163 + */
1.4164 + MemPage *pNewTrunk;
1.4165 + Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
1.4166 + rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
1.4167 + if( rc!=SQLITE_OK ){
1.4168 + goto end_allocate_page;
1.4169 + }
1.4170 + rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
1.4171 + if( rc!=SQLITE_OK ){
1.4172 + releasePage(pNewTrunk);
1.4173 + goto end_allocate_page;
1.4174 + }
1.4175 + memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
1.4176 + put4byte(&pNewTrunk->aData[4], k-1);
1.4177 + memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
1.4178 + releasePage(pNewTrunk);
1.4179 + if( !pPrevTrunk ){
1.4180 + put4byte(&pPage1->aData[32], iNewTrunk);
1.4181 + }else{
1.4182 + rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
1.4183 + if( rc ){
1.4184 + goto end_allocate_page;
1.4185 + }
1.4186 + put4byte(&pPrevTrunk->aData[0], iNewTrunk);
1.4187 + }
1.4188 + }
1.4189 + pTrunk = 0; /* Ownership moved to *ppPage */
1.4190 + TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
1.4191 +#endif
1.4192 + }else{
1.4193 + /* Extract a leaf from the trunk */
1.4194 + int closest; /* Index of the leaf chosen from this trunk */
1.4195 + Pgno iPage;
1.4196 + unsigned char *aData = pTrunk->aData;
1.4197 + rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4198 + if( rc ){
1.4199 + goto end_allocate_page;
1.4200 + }
1.4201 + if( nearby>0 ){
1.4202 + int i, dist;
1.4203 + closest = 0;
1.4204 + dist = get4byte(&aData[8]) - nearby;
1.4205 + if( dist<0 ) dist = -dist;
1.4206 + for(i=1; i<k; i++){ /* Pick the leaf numerically closest to 'nearby' */
1.4207 + int d2 = get4byte(&aData[8+i*4]) - nearby;
1.4208 + if( d2<0 ) d2 = -d2;
1.4209 + if( d2<dist ){
1.4210 + closest = i;
1.4211 + dist = d2;
1.4212 + }
1.4213 + }
1.4214 + }else{
1.4215 + closest = 0;
1.4216 + }
1.4217 +
1.4218 + iPage = get4byte(&aData[8+closest*4]);
1.4219 + if( !searchList || iPage==nearby ){
1.4220 + int nPage;
1.4221 + *pPgno = iPage;
1.4222 + nPage = pagerPagecount(pBt->pPager);
1.4223 + if( *pPgno>nPage ){
1.4224 + /* Free page off the end of the file */
1.4225 + rc = SQLITE_CORRUPT_BKPT;
1.4226 + goto end_allocate_page;
1.4227 + }
1.4228 + TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
1.4229 + ": %d more free pages\n",
1.4230 + *pPgno, closest+1, k, pTrunk->pgno, n-1));
1.4231 + if( closest<k-1 ){ /* Fill the hole with the last leaf entry */
1.4232 + memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
1.4233 + }
1.4234 + put4byte(&aData[4], k-1);
1.4235 + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
1.4236 + if( rc==SQLITE_OK ){
1.4237 + sqlite3PagerDontRollback((*ppPage)->pDbPage);
1.4238 + rc = sqlite3PagerWrite((*ppPage)->pDbPage);
1.4239 + if( rc!=SQLITE_OK ){
1.4240 + releasePage(*ppPage);
1.4241 + }
1.4242 + }
1.4243 + searchList = 0;
1.4244 + }
1.4245 + }
1.4246 + releasePage(pPrevTrunk);
1.4247 + pPrevTrunk = 0;
1.4248 + }while( searchList );
1.4249 + }else{
1.4250 + /* There are no pages on the freelist, so create a new page at the
1.4251 + ** end of the file */
1.4252 + int nPage = pagerPagecount(pBt->pPager);
1.4253 + *pPgno = nPage + 1;
1.4254 +
1.4255 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4256 + if( pBt->nTrunc ){
1.4257 + /* An incr-vacuum has already run within this transaction. So the
1.4258 + ** page to allocate is not from the physical end of the file, but
1.4259 + ** at pBt->nTrunc.
1.4260 + */
1.4261 + *pPgno = pBt->nTrunc+1;
1.4262 + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
1.4263 + (*pPgno)++;
1.4264 + }
1.4265 + }
1.4266 + if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
1.4267 + /* If *pPgno refers to a pointer-map page, allocate two new pages
1.4268 + ** at the end of the file instead of one. The first allocated page
1.4269 + ** becomes a new pointer-map page, the second is used by the caller.
1.4270 + */
1.4271 + TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
1.4272 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4273 + (*pPgno)++;
1.4274 + if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
1.4275 + }
1.4276 + if( pBt->nTrunc ){
1.4277 + pBt->nTrunc = *pPgno;
1.4278 + }
1.4279 +#endif
1.4280 +
1.4281 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4282 + rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
1.4283 + if( rc ) return rc;
1.4284 + rc = sqlite3PagerWrite((*ppPage)->pDbPage);
1.4285 + if( rc!=SQLITE_OK ){
1.4286 + releasePage(*ppPage);
1.4287 + }
1.4288 + TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
1.4289 + }
1.4290 +
1.4291 + assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
1.4292 + /* Common exit: drop trunk references, then validate the result. */
1.4293 +end_allocate_page:
1.4294 + releasePage(pTrunk);
1.4295 + releasePage(pPrevTrunk);
1.4296 + if( rc==SQLITE_OK ){
1.4297 + if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
1.4298 + releasePage(*ppPage); /* Page is referenced elsewhere: treat as corruption */
1.4299 + return SQLITE_CORRUPT_BKPT;
1.4300 + }
1.4301 + (*ppPage)->isInit = 0;
1.4302 + }
1.4303 + return rc;
1.4304 +}
1.4305 +
1.4306 +/*
1.4307 +** Add a page of the database file to the freelist.
1.4308 +**
1.4309 +** sqlite3PagerUnref() is NOT called for pPage.
1.4310 +*/
1.4311 +static int freePage(MemPage *pPage){
1.4312 +  BtShared *pBt = pPage->pBt;
1.4313 +  MemPage *pPage1 = pBt->pPage1;
1.4314 +  int rc, n, k;  /* Result code; old free-page count; leaves on first trunk */
1.4315 +
1.4316 +  /* Prepare the page for freeing */
1.4317 +  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4318 +  assert( pPage->pgno>1 );
1.4319 +  pPage->isInit = 0;
1.4320 +
1.4321 +  /* Increment the free page count on pPage1 */
1.4322 +  rc = sqlite3PagerWrite(pPage1->pDbPage);
1.4323 +  if( rc ) return rc;
1.4324 +  n = get4byte(&pPage1->aData[36]);
1.4325 +  put4byte(&pPage1->aData[36], n+1);
1.4326 +
1.4327 +#ifdef SQLITE_SECURE_DELETE
1.4328 +  /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
1.4329 +  ** always fully overwrite deleted information with zeros.
1.4330 +  */
1.4331 +  rc = sqlite3PagerWrite(pPage->pDbPage);
1.4332 +  if( rc ) return rc;
1.4333 +  memset(pPage->aData, 0, pPage->pBt->pageSize);
1.4334 +#endif
1.4335 +
1.4336 +  /* If the database supports auto-vacuum, write an entry in the pointer-map
1.4337 +  ** to indicate that the page is free.
1.4338 +  */
1.4339 +  if( ISAUTOVACUUM ){
1.4340 +    rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
1.4341 +    if( rc ) return rc;
1.4342 +  }
1.4343 +
1.4344 +  if( n==0 ){
1.4345 +    /* This is the first free page */
1.4346 +    rc = sqlite3PagerWrite(pPage->pDbPage);
1.4347 +    if( rc ) return rc;
1.4348 +    memset(pPage->aData, 0, 8);  /* No next trunk, zero leaves */
1.4349 +    put4byte(&pPage1->aData[32], pPage->pgno);
1.4350 +    TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
1.4351 +  }else{
1.4352 +    /* Other free pages already exist. Retrieve the first trunk page
1.4353 +    ** of the freelist and find out how many leaves it has. */
1.4354 +    MemPage *pTrunk;
1.4355 +    rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
1.4356 +    if( rc ) return rc;
1.4357 +    k = get4byte(&pTrunk->aData[4]);
1.4358 +    if( k>=pBt->usableSize/4 - 8 ){
1.4359 +      /* The trunk is full. Turn the page being freed into a new
1.4360 +      ** trunk page with no leaves.
1.4361 +      **
1.4362 +      ** Note that the trunk page is not really full until it contains
1.4363 +      ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
1.4364 +      ** coded. But due to a coding error in versions of SQLite prior to
1.4365 +      ** 3.6.0, databases with freelist trunk pages holding more than
1.4366 +      ** usableSize/4 - 8 entries will be reported as corrupt. In order
1.4367 +      ** to maintain backwards compatibility with older versions of SQLite,
1.4368 +      ** we will continue to restrict the number of entries to usableSize/4 - 8
1.4369 +      ** for now. At some point in the future (once everyone has upgraded
1.4370 +      ** to 3.6.0 or later) we should consider fixing the conditional above
1.4371 +      ** to read "usableSize/4-2" instead of "usableSize/4-8".
1.4372 +      */
1.4373 +      rc = sqlite3PagerWrite(pPage->pDbPage);
1.4374 +      if( rc==SQLITE_OK ){
1.4375 +        put4byte(pPage->aData, pTrunk->pgno);
1.4376 +        put4byte(&pPage->aData[4], 0);
1.4377 +        put4byte(&pPage1->aData[32], pPage->pgno);
1.4378 +        TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
1.4379 +               pPage->pgno, pTrunk->pgno));
1.4380 +      }
1.4381 +    }else if( k<0 ){  /* Leaf count read back as negative: corrupt trunk */
1.4382 +      rc = SQLITE_CORRUPT_BKPT;
1.4383 +    }else{
1.4384 +      /* Add the newly freed page as a leaf on the current trunk */
1.4385 +      rc = sqlite3PagerWrite(pTrunk->pDbPage);
1.4386 +      if( rc==SQLITE_OK ){
1.4387 +        put4byte(&pTrunk->aData[4], k+1);
1.4388 +        put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
1.4389 +#ifndef SQLITE_SECURE_DELETE
1.4390 +        rc = sqlite3PagerDontWrite(pPage->pDbPage);
1.4391 +#endif
1.4392 +      }
1.4393 +      TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
1.4394 +    }
1.4395 +    releasePage(pTrunk);
1.4396 +  }
1.4397 +  return rc;
1.4398 +}
1.4399 +
1.4400 +/*
1.4401 +** Free any overflow pages associated with the given Cell.
1.4402 +*/
1.4403 +static int clearCell(MemPage *pPage, unsigned char *pCell){
1.4404 +  BtShared *pBt = pPage->pBt;
1.4405 +  CellInfo info;        /* Parsed form of pCell */
1.4406 +  Pgno pgnoNext;        /* Next overflow page to free */
1.4407 +  int nRemaining;       /* Overflow pages still to be freed */
1.4408 +  int nPerPage;         /* Payload bytes carried by one overflow page */
1.4409 +  int rc = SQLITE_OK;
1.4410 +
1.4411 +  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4412 +  sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4413 +  if( info.iOverflow!=0 ){
1.4414 +    /* The cell spills onto overflow pages: walk the chain, freeing each. */
1.4415 +    pgnoNext = get4byte(&pCell[info.iOverflow]);
1.4416 +    nPerPage = pBt->usableSize - 4;
1.4417 +    nRemaining = (info.nPayload - info.nLocal + nPerPage - 1)/nPerPage;
1.4418 +    assert( pgnoNext==0 || nRemaining>0 );
1.4419 +    while( rc==SQLITE_OK && nRemaining!=0 ){
1.4420 +      MemPage *pOvfl;
1.4421 +      nRemaining--;
1.4422 +      if( pgnoNext==0 || pgnoNext>pagerPagecount(pBt->pPager) ){
1.4423 +        return SQLITE_CORRUPT_BKPT;
1.4424 +      }
1.4425 +      /* For the final page, no successor page number is requested. */
1.4426 +      rc = getOverflowPage(pBt, pgnoNext, &pOvfl, (nRemaining==0)?0:&pgnoNext);
1.4427 +      if( rc ) return rc;
1.4428 +      rc = freePage(pOvfl);
1.4429 +      sqlite3PagerUnref(pOvfl->pDbPage);
1.4430 +    }
1.4431 +  }
1.4432 +  return rc;
1.4433 +}
1.4434 +
1.4435 +/*
1.4436 +** Create the byte sequence used to represent a cell on page pPage
1.4437 +** and write that byte sequence into pCell[]. Overflow pages are
1.4438 +** allocated and filled in as necessary. The calling procedure
1.4439 +** is responsible for making sure sufficient space has been allocated
1.4440 +** for pCell[]. The cell size is stored in *pnSize. Returns SQLITE_OK, or the error code of a failed overflow-page allocation or ptrmapPut() call.
1.4441 +**
1.4442 +** Note that pCell does not necessarily need to point to the pPage->aData
1.4443 +** area. pCell might point to some temporary storage. The cell will
1.4444 +** be constructed in this temporary area then copied into pPage->aData
1.4445 +** later.
1.4446 +*/
1.4447 +static int fillInCell(
1.4448 + MemPage *pPage, /* The page that contains the cell */
1.4449 + unsigned char *pCell, /* Complete text of the cell */
1.4450 + const void *pKey, i64 nKey, /* The key */
1.4451 + const void *pData,int nData, /* The data */
1.4452 + int nZero, /* Extra zero bytes to append to pData */
1.4453 + int *pnSize /* Write cell size here */
1.4454 +){
1.4455 + int nPayload; /* Payload bytes not yet written */
1.4456 + const u8 *pSrc; /* Next source byte to copy */
1.4457 + int nSrc, n, rc; /* Bytes left at pSrc; bytes written this pass; return code */
1.4458 + int spaceLeft; /* Bytes still free in the current destination */
1.4459 + MemPage *pOvfl = 0; /* Most recently allocated overflow page */
1.4460 + MemPage *pToRelease = 0; /* Overflow page whose reference is still held */
1.4461 + unsigned char *pPrior; /* Where the next overflow page number is written */
1.4462 + unsigned char *pPayload; /* Where the next payload byte is written */
1.4463 + BtShared *pBt = pPage->pBt;
1.4464 + Pgno pgnoOvfl = 0; /* Page number of the most recent overflow page; 0 if none yet */
1.4465 + int nHeader; /* Bytes of cell header produced so far */
1.4466 + CellInfo info; /* Parse of the header written below */
1.4467 +
1.4468 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4469 +
1.4470 + /* Fill in the header. */
1.4471 + nHeader = 0;
1.4472 + if( !pPage->leaf ){
1.4473 + nHeader += 4; /* Skip 4 bytes without writing them; balance_quick() stores a page number there */
1.4474 + }
1.4475 + if( pPage->hasData ){
1.4476 + nHeader += putVarint(&pCell[nHeader], nData+nZero); /* Data length including the zero tail */
1.4477 + }else{
1.4478 + nData = nZero = 0; /* This page stores no data: ignore any that was supplied */
1.4479 + }
1.4480 + nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
1.4481 + sqlite3BtreeParseCellPtr(pPage, pCell, &info); /* Parse the header back to get nLocal, iOverflow, nSize */
1.4482 + assert( info.nHeader==nHeader );
1.4483 + assert( info.nKey==nKey );
1.4484 + assert( info.nData==nData+nZero );
1.4485 +
1.4486 + /* Fill in the payload */
1.4487 + nPayload = nData + nZero;
1.4488 + if( pPage->intKey ){
1.4489 + pSrc = pData;
1.4490 + nSrc = nData;
1.4491 + nData = 0; /* Once pData is consumed nothing is left to copy; the rest is zero-filled */
1.4492 + }else{
1.4493 + nPayload += nKey; /* Key bytes are part of the payload and are written first */
1.4494 + pSrc = pKey;
1.4495 + nSrc = nKey;
1.4496 + }
1.4497 + *pnSize = info.nSize;
1.4498 + spaceLeft = info.nLocal; /* Payload bytes that are stored in the cell itself */
1.4499 + pPayload = &pCell[nHeader];
1.4500 + pPrior = &pCell[info.iOverflow]; /* The first overflow page number goes here */
1.4501 +
1.4502 + while( nPayload>0 ){
1.4503 + if( spaceLeft==0 ){ /* Current destination is full: chain a new overflow page */
1.4504 + int isExact = 0;
1.4505 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4506 + Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
1.4507 + if( pBt->autoVacuum ){
1.4508 + do{
1.4509 + pgnoOvfl++; /* Step past pointer-map pages and the pending-byte page */
1.4510 + } while(
1.4511 + PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
1.4512 + );
1.4513 + if( pgnoOvfl>1 ){
1.4514 + /* isExact = 1; */
1.4515 + }
1.4516 + }
1.4517 +#endif
1.4518 + rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
1.4519 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4520 + /* If the database supports auto-vacuum, and the second or subsequent
1.4521 + ** overflow page is being allocated, add an entry to the pointer-map
1.4522 + ** for that page now.
1.4523 + **
1.4524 + ** If this is the first overflow page, then write a partial entry
1.4525 + ** to the pointer-map. If we write nothing to this pointer-map slot,
1.4526 + ** then the optimistic overflow chain processing in clearCell()
1.4527 + ** may misinterpret the uninitialised values and delete the
1.4528 + ** wrong pages from the database.
1.4529 + */
1.4530 + if( pBt->autoVacuum && rc==SQLITE_OK ){
1.4531 + u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); /* pgnoPtrmap is 0 only for the first overflow page */
1.4532 + rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
1.4533 + if( rc ){
1.4534 + releasePage(pOvfl); /* The page just allocated will not be linked in */
1.4535 + }
1.4536 + }
1.4537 +#endif
1.4538 + if( rc ){
1.4539 + releasePage(pToRelease);
1.4540 + return rc;
1.4541 + }
1.4542 + put4byte(pPrior, pgnoOvfl); /* Link the new page from the cell or the previous overflow page */
1.4543 + releasePage(pToRelease); /* Finished with the previous overflow page, if there was one */
1.4544 + pToRelease = pOvfl;
1.4545 + pPrior = pOvfl->aData;
1.4546 + put4byte(pPrior, 0); /* The new page is the end of the chain for now */
1.4547 + pPayload = &pOvfl->aData[4]; /* Payload follows the 4-byte page number */
1.4548 + spaceLeft = pBt->usableSize - 4;
1.4549 + }
1.4550 + n = nPayload; /* n = the smaller of nPayload and spaceLeft (and nSrc when copying) */
1.4551 + if( n>spaceLeft ) n = spaceLeft;
1.4552 + if( nSrc>0 ){
1.4553 + if( n>nSrc ) n = nSrc;
1.4554 + assert( pSrc );
1.4555 + memcpy(pPayload, pSrc, n);
1.4556 + }else{
1.4557 + memset(pPayload, 0, n); /* No source bytes left: write zeros */
1.4558 + }
1.4559 + nPayload -= n;
1.4560 + pPayload += n;
1.4561 + pSrc += n;
1.4562 + nSrc -= n;
1.4563 + spaceLeft -= n;
1.4564 + if( nSrc==0 ){ /* Current source consumed: continue with the data segment */
1.4565 + nSrc = nData;
1.4566 + pSrc = pData;
1.4567 + }
1.4568 + }
1.4569 + releasePage(pToRelease); /* Release the last overflow page, if any */
1.4570 + return SQLITE_OK;
1.4571 +}
1.4572 +
1.4573 +/*
1.4574 +** Remove the cell at index idx from pPage. This routine affects pPage only.
1.4575 +** The cell content is not freed or deallocated. It is assumed that
1.4576 +** the cell content has been copied someplace else. This routine just
1.4577 +** removes the reference to the cell from pPage.
1.4578 +**
1.4579 +** "sz" must be the number of bytes in the cell. Returns SQLITE_OK, SQLITE_CORRUPT_BKPT if the cell offset is out of range, or the error from freeSpace().
1.4580 +*/
1.4581 +static int dropCell(MemPage *pPage, int idx, int sz){
1.4582 + int i; /* Loop counter */
1.4583 + int pc; /* Offset to cell content of cell being deleted */
1.4584 + u8 *data; /* pPage->aData */
1.4585 + u8 *ptr; /* Used to move bytes around within data[] */
1.4586 + int rc; /* Return code */
1.4587 +
1.4588 + assert( idx>=0 && idx<pPage->nCell );
1.4589 + assert( sz==cellSize(pPage, idx) );
1.4590 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.4591 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4592 + data = pPage->aData;
1.4593 + ptr = &data[pPage->cellOffset + 2*idx]; /* Slot idx of the 2-byte cell pointer array */
1.4594 + pc = get2byte(ptr);
1.4595 + if( pc<pPage->hdrOffset+6+(pPage->leaf?0:4)
1.4596 + || pc+sz>pPage->pBt->usableSize
1.4597 + ){ /* The cell offset is too small or the cell extends past the usable area */
1.4598 + return SQLITE_CORRUPT_BKPT;
1.4599 + }
1.4600 + rc = freeSpace(pPage, pc, sz); /* NOTE(review): header says content is "not freed", yet freeSpace() is called on it - confirm intent */
1.4601 + if( rc ) return rc;
1.4602 + for(i=idx+1; i<pPage->nCell; i++, ptr+=2){ /* Shift the later cell pointers down one slot */
1.4603 + ptr[0] = ptr[2];
1.4604 + ptr[1] = ptr[3];
1.4605 + }
1.4606 + pPage->nCell--;
1.4607 + put2byte(&data[pPage->hdrOffset+3], pPage->nCell); /* Store the new cell count in the page header */
1.4608 + pPage->nFree += 2; /* The 2-byte pointer slot is free again */
1.4609 + return SQLITE_OK;
1.4610 +}
1.4611 +
1.4612 +/*
1.4613 +** Insert a new cell on pPage at cell index "i". pCell points to the
1.4614 +** content of the cell.
1.4615 +**
1.4616 +** If the cell content will fit on the page, then put it there. If it
1.4617 +** will not fit, then make a copy of the cell content into pTemp if
1.4618 +** pTemp is not null. Regardless of pTemp, allocate a new entry
1.4619 +** in pPage->aOvfl[] and make it point to the cell content (either
1.4620 +** in pTemp or the original pCell) and also record its index.
1.4621 +** Allocating a new entry in pPage->aOvfl[] implies that
1.4622 +** pPage->nOverflow is incremented.
1.4623 +**
1.4624 +** If nSkip is non-zero, then do not copy the first nSkip bytes of the
1.4625 +** cell. The caller will overwrite them after this function returns. If
1.4626 +** nSkip is non-zero, then pCell itself might point to an invalid memory location
1.4627 +** (but pCell+nSkip is always valid).
1.4628 +*/
1.4629 +static int insertCell(
1.4630 + MemPage *pPage, /* Page into which we are copying */
1.4631 + int i, /* New cell becomes the i-th cell of the page */
1.4632 + u8 *pCell, /* Content of the new cell */
1.4633 + int sz, /* Bytes of content in pCell */
1.4634 + u8 *pTemp, /* Temp storage space for pCell, if needed */
1.4635 + u8 nSkip /* Do not write the first nSkip bytes of the cell */
1.4636 +){
1.4637 + int idx; /* Where to write new cell content in data[] */
1.4638 + int j; /* Loop counter */
1.4639 + int top; /* First byte of content for any cell in data[] */
1.4640 + int end; /* First byte past the last cell pointer in data[] */
1.4641 + int ins; /* Index in data[] where new cell pointer is inserted */
1.4642 + int hdr; /* Offset into data[] of the page header */
1.4643 + int cellOffset; /* Address of first cell pointer in data[] */
1.4644 + u8 *data; /* The content of the whole page */
1.4645 + u8 *ptr; /* Used for moving information around in data[] */
1.4646 +
1.4647 + assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
1.4648 + assert( sz==cellSizePtr(pPage, pCell) );
1.4649 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4650 + if( pPage->nOverflow || sz+2>pPage->nFree ){ /* Already overflowing, or no room for the cell plus its 2-byte pointer */
1.4651 + if( pTemp ){
1.4652 + memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); /* Copy everything except the first nSkip bytes */
1.4653 + pCell = pTemp;
1.4654 + }
1.4655 + j = pPage->nOverflow++;
1.4656 + assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
1.4657 + pPage->aOvfl[j].pCell = pCell;
1.4658 + pPage->aOvfl[j].idx = i;
1.4659 + pPage->nFree = 0; /* balance_quick() notes that nFree is not correct after this */
1.4660 + }else{
1.4661 + int rc = sqlite3PagerWrite(pPage->pDbPage);
1.4662 + if( rc!=SQLITE_OK ){
1.4663 + return rc;
1.4664 + }
1.4665 + assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1.4666 + data = pPage->aData;
1.4667 + hdr = pPage->hdrOffset;
1.4668 + top = get2byte(&data[hdr+5]);
1.4669 + cellOffset = pPage->cellOffset;
1.4670 + end = cellOffset + 2*pPage->nCell + 2; /* End of the pointer array once one slot is added */
1.4671 + ins = cellOffset + 2*i;
1.4672 + if( end > top - sz ){ /* Gap between the pointer array and cell content is too small */
1.4673 + rc = defragmentPage(pPage);
1.4674 + if( rc ) return rc;
1.4675 + top = get2byte(&data[hdr+5]); /* Re-read after defragmentPage() */
1.4676 + assert( end + sz <= top );
1.4677 + }
1.4678 + idx = allocateSpace(pPage, sz);
1.4679 + assert( idx>0 );
1.4680 + assert( end <= get2byte(&data[hdr+5]) );
1.4681 + if( idx+sz > pPage->pBt->usableSize ){
1.4682 + return SQLITE_CORRUPT_BKPT;
1.4683 + }
1.4684 + pPage->nCell++;
1.4685 + pPage->nFree -= 2; /* Account for the new 2-byte cell pointer */
1.4686 + memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
1.4687 + for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ /* Shift pointers from slot i onward up one slot */
1.4688 + ptr[0] = ptr[-2];
1.4689 + ptr[1] = ptr[-1];
1.4690 + }
1.4691 + put2byte(&data[ins], idx); /* Slot i now points at the new cell content */
1.4692 + put2byte(&data[hdr+3], pPage->nCell); /* Store the new cell count in the page header */
1.4693 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.4694 + if( pPage->pBt->autoVacuum ){
1.4695 + /* The cell may contain a pointer to an overflow page. If so, write
1.4696 + ** the entry for the overflow page into the pointer map.
1.4697 + */
1.4698 + CellInfo info;
1.4699 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4700 + assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
1.4701 + if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){ /* Payload exceeds what is stored locally */
1.4702 + Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
1.4703 + rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
1.4704 + if( rc!=SQLITE_OK ) return rc; /* The page itself has already been modified at this point */
1.4705 + }
1.4706 + }
1.4707 +#endif
1.4708 + }
1.4709 +
1.4710 + return SQLITE_OK;
1.4711 +}
1.4712 +
1.4713 +/*
1.4714 +** Add a list of cells to a page. The page should be initially empty.
1.4715 +** The cells are guaranteed to fit on the page.
1.4716 +*/
1.4717 +static void assemblePage(
1.4718 + MemPage *pPage, /* The page to be assembled */
1.4719 + int nCell, /* The number of cells to add to this page */
1.4720 + u8 **apCell, /* Pointers to cell bodies */
1.4721 + u16 *aSize /* Sizes of the cells */
1.4722 +){
1.4723 + int i; /* Loop counter */
1.4724 + int totalSize; /* Total size of all cells */
1.4725 + int hdr; /* Index of page header */
1.4726 + int cellptr; /* Address of next cell pointer */
1.4727 + int cellbody; /* Address of next cell body */
1.4728 + u8 *data; /* Data for the page */
1.4729 +
1.4730 + assert( pPage->nOverflow==0 );
1.4731 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4732 + totalSize = 0;
1.4733 + for(i=0; i<nCell; i++){
1.4734 + totalSize += aSize[i];
1.4735 + }
1.4736 + assert( totalSize+2*nCell<=pPage->nFree ); /* Cell bodies plus one 2-byte pointer each must fit */
1.4737 + assert( pPage->nCell==0 );
1.4738 + cellptr = pPage->cellOffset;
1.4739 + data = pPage->aData;
1.4740 + hdr = pPage->hdrOffset;
1.4741 + put2byte(&data[hdr+3], nCell); /* Store the cell count in the page header */
1.4742 + if( nCell ){
1.4743 + cellbody = allocateSpace(pPage, totalSize); /* One region for all of the cell bodies */
1.4744 + assert( cellbody>0 );
1.4745 + assert( pPage->nFree >= 2*nCell );
1.4746 + pPage->nFree -= 2*nCell; /* Account for the cell pointer array */
1.4747 + for(i=0; i<nCell; i++){
1.4748 + put2byte(&data[cellptr], cellbody); /* Pointer i records where body i starts */
1.4749 + memcpy(&data[cellbody], apCell[i], aSize[i]);
1.4750 + cellptr += 2;
1.4751 + cellbody += aSize[i];
1.4752 + }
1.4753 + assert( cellbody==pPage->pBt->usableSize ); /* The bodies end exactly at the end of the usable area */
1.4754 + }
1.4755 + pPage->nCell = nCell;
1.4756 +}
1.4757 +
1.4758 +/*
1.4759 +** The following parameters determine how many adjacent pages get involved
1.4760 +** in a balancing operation. NN is the number of neighbors on either side
1.4761 +** of the page that participate in the balancing operation. NB is the
1.4762 +** total number of pages that participate, including the target page and
1.4763 +** NN neighbors on either side.
1.4764 +**
1.4765 +** The minimum value of NN is 1 (of course). Increasing NN above 1
1.4766 +** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
1.4767 +** in exchange for a larger degradation in INSERT and UPDATE performance.
1.4768 +** A value of 1 for NN appears to give the best results overall.
1.4769 +*/
1.4770 +#define NN 1 /* Number of neighbors on either side of pPage */
1.4771 +#define NB (NN*2+1) /* Total pages involved in the balance */
1.4772 +
1.4773 +/* Forward reference: balance_quick() below calls balance() */
1.4774 +static int balance(BtCursor*, int);
1.4775 +
1.4776 +#ifndef SQLITE_OMIT_QUICKBALANCE
1.4777 +/*
1.4778 +** This version of balance() handles the common special case where
1.4779 +** a new entry is being inserted on the extreme right-end of the
1.4780 +** tree, in other words, when the new entry will become the largest
1.4781 +** entry in the tree.
1.4782 +**
1.4783 +** Instead of trying to balance the 3 right-most leaf pages, just add
1.4784 +** a new page to the right-hand side and put the one new entry in
1.4785 +** that page. This leaves the right side of the tree somewhat
1.4786 +** unbalanced. But odds are that we will be inserting new entries
1.4787 +** at the end soon afterwards so the nearly empty page will quickly
1.4788 +** fill up. On average.
1.4789 +**
1.4790 +** pPage is the leaf page which is the right-most page in the tree.
1.4791 +** pParent is its parent. pPage must have a single overflow entry
1.4792 +** which is also the right-most entry on the page.
1.4793 +*/
1.4794 +static int balance_quick(BtCursor *pCur){
1.4795 + int rc; /* Return code */
1.4796 + MemPage *pNew = 0; /* The newly allocated page */
1.4797 + Pgno pgnoNew; /* Page number of pNew */
1.4798 + u8 *pCell; /* First the overflow cell, later the last cell stored on pPage */
1.4799 + u16 szCell; /* Size of the overflow cell */
1.4800 + CellInfo info; /* Parse of the last cell stored on pPage */
1.4801 + MemPage *pPage = pCur->apPage[pCur->iPage]; /* The page the cursor is on */
1.4802 + MemPage *pParent = pCur->apPage[pCur->iPage-1]; /* One level up in the cursor's page stack */
1.4803 + BtShared *pBt = pPage->pBt;
1.4804 + int parentIdx = pParent->nCell; /* pParent new divider cell index */
1.4805 + int parentSize; /* Size of new divider cell */
1.4806 + u8 parentCell[64]; /* Space for the new divider cell */
1.4807 +
1.4808 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4809 +
1.4810 + /* Allocate a new page. Insert the overflow cell from pPage
1.4811 + ** into it. Then remove the overflow cell from pPage.
1.4812 + */
1.4813 + rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
1.4814 + if( rc==SQLITE_OK ){
1.4815 + pCell = pPage->aOvfl[0].pCell; /* The single overflow cell of pPage */
1.4816 + szCell = cellSizePtr(pPage, pCell);
1.4817 + zeroPage(pNew, pPage->aData[0]); /* Passes pPage's first data byte (presumably its page flags) */
1.4818 + assemblePage(pNew, 1, &pCell, &szCell); /* pNew receives exactly that one cell */
1.4819 + pPage->nOverflow = 0; /* The overflow cell no longer belongs to pPage */
1.4820 +
1.4821 + /* pPage is currently the right-child of pParent. Change this
1.4822 + ** so that the right-child is the new page allocated above and
1.4823 + ** pPage is the next-to-right child.
1.4824 + **
1.4825 + ** Ignore the return value of the call to fillInCell(). fillInCell()
1.4826 + ** may only return other than SQLITE_OK if it is required to allocate
1.4827 + ** one or more overflow pages. Since an internal table B-Tree cell
1.4828 + ** may never spill over onto an overflow page (it is a maximum of
1.4829 + ** 13 bytes in size), it is not necessary to check the return code.
1.4830 + **
1.4831 + ** Similarly, the insertCell() function cannot fail if the page
1.4832 + ** being inserted into is already writable and the cell does not
1.4833 + ** contain an overflow pointer. So ignore this return code too.
1.4834 + */
1.4835 + assert( pPage->nCell>0 );
1.4836 + pCell = findCell(pPage, pPage->nCell-1); /* Last cell stored on pPage */
1.4837 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.4838 + fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize); /* The divider carries only that cell's key */
1.4839 + assert( parentSize<64 );
1.4840 + assert( sqlite3PagerIswriteable(pParent->pDbPage) );
1.4841 + insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4); /* nSkip=4: the first 4 bytes are written on the next line */
1.4842 + put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno); /* The new divider's child pointer is pPage */
1.4843 + put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); /* The right-child of pParent is now the new page */
1.4844 +
1.4845 + /* If this is an auto-vacuum database, update the pointer map
1.4846 + ** with entries for the new page, and any pointer from the
1.4847 + ** cell on the page to an overflow page.
1.4848 + */
1.4849 + if( ISAUTOVACUUM ){
1.4850 + rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
1.4851 + if( rc==SQLITE_OK ){
1.4852 + rc = ptrmapPutOvfl(pNew, 0);
1.4853 + }
1.4854 + }
1.4855 +
1.4856 + /* Release the reference to the new page. */
1.4857 + releasePage(pNew);
1.4858 + }
1.4859 +
1.4860 + /* At this point the pPage->nFree variable is not set correctly with
1.4861 + ** respect to the content of the page (because it was set to 0 by
1.4862 + ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
1.4863 + ** correct.
1.4864 + **
1.4865 + ** This has to be done even if an error will be returned. Normally, if
1.4866 + ** an error occurs during tree balancing, the contents of MemPage are
1.4867 + ** not important, as they will be recalculated when the page is rolled
1.4868 + ** back. But here, in balance_quick(), it is possible that pPage has
1.4869 + ** not yet been marked dirty or written into the journal file. Therefore
1.4870 + ** it will not be rolled back and so it is important to make sure that
1.4871 + ** the page data and contents of MemPage are consistent.
1.4872 + */
1.4873 + pPage->isInit = 0; /* Clear isInit before re-initializing pPage */
1.4874 + sqlite3BtreeInitPage(pPage); /* The return value is not checked here */
1.4875 +
1.4876 + /* If everything else succeeded, balance the parent page, in
1.4877 + ** case the divider cell inserted caused it to become overfull.
1.4878 + */
1.4879 + if( rc==SQLITE_OK ){
1.4880 + releasePage(pPage); /* Drop pPage before the cursor moves up to the parent */
1.4881 + pCur->iPage--;
1.4882 + rc = balance(pCur, 0);
1.4883 + }
1.4884 + return rc;
1.4885 +}
1.4886 +#endif /* SQLITE_OMIT_QUICKBALANCE */
1.4887 +
1.4888 +/*
1.4889 +** This routine redistributes Cells on pPage and up to NN*2 siblings
1.4890 +** of pPage so that all pages have about the same amount of free space.
1.4891 +** Usually NN siblings on either side of pPage are used in the balancing,
1.4892 +** though more siblings might come from one side if pPage is the first
1.4893 +** or last child of its parent. If pPage has fewer than 2*NN siblings
1.4894 +** (something which can only happen if pPage is the root page or a
1.4895 +** child of root) then all available siblings participate in the balancing.
1.4896 +**
1.4897 +** The number of siblings of pPage might be increased or decreased by one or
1.4898 +** two in an effort to keep pages nearly full but not over full. The root page
1.4899 +** is special and is allowed to be nearly empty. If pPage is
1.4900 +** the root page, then the depth of the tree might be increased
1.4901 +** or decreased by one, as necessary, to keep the root page from being
1.4902 +** overfull or completely empty.
1.4903 +**
1.4904 +** Note that when this routine is called, some of the Cells on pPage
1.4905 +** might not actually be stored in pPage->aData[]. This can happen
1.4906 +** if the page is overfull. Part of the job of this routine is to
1.4907 +** make sure all Cells for pPage once again fit in pPage->aData[].
1.4908 +**
1.4909 +** In the course of balancing the siblings of pPage, the parent of pPage
1.4910 +** might become overfull or underfull. If that happens, then this routine
1.4911 +** is called recursively on the parent.
1.4912 +**
1.4913 +** If this routine fails for any reason, it might leave the database
1.4914 +** in a corrupted state. So if this routine fails, the database should
1.4915 +** be rolled back.
1.4916 +*/
1.4917 +static int balance_nonroot(BtCursor *pCur){
1.4918 + MemPage *pPage; /* The over or underfull page to balance */
1.4919 + MemPage *pParent; /* The parent of pPage */
1.4920 + BtShared *pBt; /* The whole database */
1.4921 + int nCell = 0; /* Number of cells in apCell[] */
1.4922 + int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
1.4923 + int nOld; /* Number of pages in apOld[] */
1.4924 + int nNew; /* Number of pages in apNew[] */
1.4925 + int nDiv; /* Number of cells in apDiv[] */
1.4926 + int i, j, k; /* Loop counters */
1.4927 + int idx; /* Index of pPage in pParent->aCell[] */
1.4928 + int nxDiv; /* Next divider slot in pParent->aCell[] */
1.4929 + int rc; /* The return code */
1.4930 + int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
1.4931 + int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
1.4932 + int usableSpace; /* Bytes in pPage beyond the header */
1.4933 + int pageFlags; /* Value of pPage->aData[0] */
1.4934 + int subtotal; /* Subtotal of bytes in cells on one page */
1.4935 + int iSpace1 = 0; /* First unused byte of aSpace1[] */
1.4936 + int iSpace2 = 0; /* First unused byte of aSpace2[] */
1.4937 + int szScratch; /* Size of scratch memory requested */
1.4938 + MemPage *apOld[NB]; /* pPage and up to two siblings */
1.4939 + Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
1.4940 + MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
1.4941 + MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
1.4942 + Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
1.4943 + u8 *apDiv[NB]; /* Divider cells in pParent */
1.4944 + int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
1.4945 + int szNew[NB+2]; /* Combined size of cells place on i-th page */
1.4946 + u8 **apCell = 0; /* All cells begin balanced */
1.4947 + u16 *szCell; /* Local size of all cells in apCell[] */
1.4948 + u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
1.4949 + u8 *aSpace1; /* Space for copies of dividers cells before balance */
1.4950 + u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
1.4951 + u8 *aFrom = 0;
1.4952 +
1.4953 + pPage = pCur->apPage[pCur->iPage];
1.4954 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.4955 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.4956 +
1.4957 + /*
1.4958 + ** Find the parent page.
1.4959 + */
1.4960 + assert( pCur->iPage>0 );
1.4961 + assert( pPage->isInit );
1.4962 + assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
1.4963 + pBt = pPage->pBt;
1.4964 + pParent = pCur->apPage[pCur->iPage-1];
1.4965 + assert( pParent );
1.4966 + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
1.4967 + return rc;
1.4968 + }
1.4969 +
1.4970 + TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
1.4971 +
1.4972 +#ifndef SQLITE_OMIT_QUICKBALANCE
1.4973 + /*
1.4974 + ** A special case: If a new entry has just been inserted into a
1.4975 + ** table (that is, a btree with integer keys and all data at the leaves)
1.4976 + ** and the new entry is the right-most entry in the tree (it has the
1.4977 + ** largest key) then use the special balance_quick() routine for
1.4978 + ** balancing. balance_quick() is much faster and results in a tighter
1.4979 + ** packing of data in the common case.
1.4980 + */
1.4981 + if( pPage->leaf &&
1.4982 + pPage->intKey &&
1.4983 + pPage->nOverflow==1 &&
1.4984 + pPage->aOvfl[0].idx==pPage->nCell &&
1.4985 + pParent->pgno!=1 &&
1.4986 + get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
1.4987 + ){
1.4988 + assert( pPage->intKey );
1.4989 + /*
1.4990 + ** TODO: Check the siblings to the left of pPage. It may be that
1.4991 + ** they are not full and no new page is required.
1.4992 + */
1.4993 + return balance_quick(pCur);
1.4994 + }
1.4995 +#endif
1.4996 +
1.4997 + if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
1.4998 + return rc;
1.4999 + }
1.5000 +
1.5001 + /*
1.5002 + ** Find the cell in the parent page whose left child points back
1.5003 + ** to pPage. The "idx" variable is the index of that cell. If pPage
1.5004 + ** is the rightmost child of pParent then set idx to pParent->nCell
1.5005 + */
1.5006 + idx = pCur->aiIdx[pCur->iPage-1];
1.5007 + assertParentIndex(pParent, idx, pPage->pgno);
1.5008 +
1.5009 + /*
1.5010 + ** Initialize variables so that it will be safe to jump
1.5011 + ** directly to balance_cleanup at any moment.
1.5012 + */
1.5013 + nOld = nNew = 0;
1.5014 +
1.5015 + /*
1.5016 + ** Find sibling pages to pPage and the cells in pParent that divide
1.5017 + ** the siblings. An attempt is made to find NN siblings on either
1.5018 + ** side of pPage. More siblings are taken from one side, however, if
1.5019 + ** pPage there are fewer than NN siblings on the other side. If pParent
1.5020 + ** has NB or fewer children then all children of pParent are taken.
1.5021 + */
1.5022 + nxDiv = idx - NN;
1.5023 + if( nxDiv + NB > pParent->nCell ){
1.5024 + nxDiv = pParent->nCell - NB + 1;
1.5025 + }
1.5026 + if( nxDiv<0 ){
1.5027 + nxDiv = 0;
1.5028 + }
1.5029 + nDiv = 0;
1.5030 + for(i=0, k=nxDiv; i<NB; i++, k++){
1.5031 + if( k<pParent->nCell ){
1.5032 + apDiv[i] = findCell(pParent, k);
1.5033 + nDiv++;
1.5034 + assert( !pParent->leaf );
1.5035 + pgnoOld[i] = get4byte(apDiv[i]);
1.5036 + }else if( k==pParent->nCell ){
1.5037 + pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
1.5038 + }else{
1.5039 + break;
1.5040 + }
1.5041 + rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
1.5042 + if( rc ) goto balance_cleanup;
1.5043 + /* apOld[i]->idxParent = k; */
1.5044 + apCopy[i] = 0;
1.5045 + assert( i==nOld );
1.5046 + nOld++;
1.5047 + nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
1.5048 + }
1.5049 +
1.5050 + /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
1.5051 + ** alignment */
1.5052 + nMaxCells = (nMaxCells + 3)&~3;
1.5053 +
1.5054 + /*
1.5055 + ** Allocate space for memory structures
1.5056 + */
1.5057 + szScratch =
1.5058 + nMaxCells*sizeof(u8*) /* apCell */
1.5059 + + nMaxCells*sizeof(u16) /* szCell */
1.5060 + + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
1.5061 + + pBt->pageSize /* aSpace1 */
1.5062 + + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
1.5063 + apCell = sqlite3ScratchMalloc( szScratch );
1.5064 + if( apCell==0 ){
1.5065 + rc = SQLITE_NOMEM;
1.5066 + goto balance_cleanup;
1.5067 + }
1.5068 + szCell = (u16*)&apCell[nMaxCells];
1.5069 + aCopy[0] = (u8*)&szCell[nMaxCells];
1.5070 + assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5071 + for(i=1; i<NB; i++){
1.5072 + aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
1.5073 + assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5074 + }
1.5075 + aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
1.5076 + assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
1.5077 + if( ISAUTOVACUUM ){
1.5078 + aFrom = &aSpace1[pBt->pageSize];
1.5079 + }
1.5080 + aSpace2 = sqlite3PageMalloc(pBt->pageSize);
1.5081 + if( aSpace2==0 ){
1.5082 + rc = SQLITE_NOMEM;
1.5083 + goto balance_cleanup;
1.5084 + }
1.5085 +
1.5086 + /*
1.5087 + ** Make copies of the content of pPage and its siblings into aOld[].
1.5088 + ** The rest of this function will use data from the copies rather
1.5089 + ** that the original pages since the original pages will be in the
1.5090 + ** process of being overwritten.
1.5091 + */
1.5092 + for(i=0; i<nOld; i++){
1.5093 + MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
1.5094 + memcpy(p, apOld[i], sizeof(MemPage));
1.5095 + p->aData = (void*)&p[1];
1.5096 + memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
1.5097 + }
1.5098 +
1.5099 + /*
1.5100 + ** Load pointers to all cells on sibling pages and the divider cells
1.5101 + ** into the local apCell[] array. Make copies of the divider cells
1.5102 + ** into space obtained form aSpace1[] and remove the the divider Cells
1.5103 + ** from pParent.
1.5104 + **
1.5105 + ** If the siblings are on leaf pages, then the child pointers of the
1.5106 + ** divider cells are stripped from the cells before they are copied
1.5107 + ** into aSpace1[]. In this way, all cells in apCell[] are without
1.5108 + ** child pointers. If siblings are not leaves, then all cell in
1.5109 + ** apCell[] include child pointers. Either way, all cells in apCell[]
1.5110 + ** are alike.
1.5111 + **
1.5112 + ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
1.5113 + ** leafData: 1 if pPage holds key+data and pParent holds only keys.
1.5114 + */
1.5115 + nCell = 0;
1.5116 + leafCorrection = pPage->leaf*4;
1.5117 + leafData = pPage->hasData;
1.5118 + for(i=0; i<nOld; i++){
1.5119 + MemPage *pOld = apCopy[i];
1.5120 + int limit = pOld->nCell+pOld->nOverflow;
1.5121 + for(j=0; j<limit; j++){
1.5122 + assert( nCell<nMaxCells );
1.5123 + apCell[nCell] = findOverflowCell(pOld, j);
1.5124 + szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
1.5125 + if( ISAUTOVACUUM ){
1.5126 + int a;
1.5127 + aFrom[nCell] = i;
1.5128 + for(a=0; a<pOld->nOverflow; a++){
1.5129 + if( pOld->aOvfl[a].pCell==apCell[nCell] ){
1.5130 + aFrom[nCell] = 0xFF;
1.5131 + break;
1.5132 + }
1.5133 + }
1.5134 + }
1.5135 + nCell++;
1.5136 + }
1.5137 + if( i<nOld-1 ){
1.5138 + u16 sz = cellSizePtr(pParent, apDiv[i]);
1.5139 + if( leafData ){
1.5140 + /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
1.5141 + ** are duplicates of keys on the child pages. We need to remove
1.5142 + ** the divider cells from pParent, but the dividers cells are not
1.5143 + ** added to apCell[] because they are duplicates of child cells.
1.5144 + */
1.5145 + dropCell(pParent, nxDiv, sz);
1.5146 + }else{
1.5147 + u8 *pTemp;
1.5148 + assert( nCell<nMaxCells );
1.5149 + szCell[nCell] = sz;
1.5150 + pTemp = &aSpace1[iSpace1];
1.5151 + iSpace1 += sz;
1.5152 + assert( sz<=pBt->pageSize/4 );
1.5153 + assert( iSpace1<=pBt->pageSize );
1.5154 + memcpy(pTemp, apDiv[i], sz);
1.5155 + apCell[nCell] = pTemp+leafCorrection;
1.5156 + if( ISAUTOVACUUM ){
1.5157 + aFrom[nCell] = 0xFF;
1.5158 + }
1.5159 + dropCell(pParent, nxDiv, sz);
1.5160 + szCell[nCell] -= leafCorrection;
1.5161 + assert( get4byte(pTemp)==pgnoOld[i] );
1.5162 + if( !pOld->leaf ){
1.5163 + assert( leafCorrection==0 );
1.5164 + /* The right pointer of the child page pOld becomes the left
1.5165 + ** pointer of the divider cell */
1.5166 + memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
1.5167 + }else{
1.5168 + assert( leafCorrection==4 );
1.5169 + if( szCell[nCell]<4 ){
1.5170 + /* Do not allow any cells smaller than 4 bytes. */
1.5171 + szCell[nCell] = 4;
1.5172 + }
1.5173 + }
1.5174 + nCell++;
1.5175 + }
1.5176 + }
1.5177 + }
1.5178 +
1.5179 + /*
1.5180 + ** Figure out the number of pages needed to hold all nCell cells.
1.5181 + ** Store this number in "k". Also compute szNew[] which is the total
1.5182 + ** size of all cells on the i-th page and cntNew[] which is the index
1.5183 + ** in apCell[] of the cell that divides page i from page i+1.
1.5184 + ** cntNew[k] should equal nCell.
1.5185 + **
1.5186 + ** Values computed by this block:
1.5187 + **
1.5188 + ** k: The total number of sibling pages
1.5189 + ** szNew[i]: Spaced used on the i-th sibling page.
1.5190 + ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
1.5191 + ** the right of the i-th sibling page.
1.5192 + ** usableSpace: Number of bytes of space available on each sibling.
1.5193 + **
1.5194 + */
1.5195 + usableSpace = pBt->usableSize - 12 + leafCorrection;
1.5196 + for(subtotal=k=i=0; i<nCell; i++){
1.5197 + assert( i<nMaxCells );
1.5198 + subtotal += szCell[i] + 2;
1.5199 + if( subtotal > usableSpace ){
1.5200 + szNew[k] = subtotal - szCell[i];
1.5201 + cntNew[k] = i;
1.5202 + if( leafData ){ i--; }
1.5203 + subtotal = 0;
1.5204 + k++;
1.5205 + }
1.5206 + }
1.5207 + szNew[k] = subtotal;
1.5208 + cntNew[k] = nCell;
1.5209 + k++;
1.5210 +
1.5211 + /*
1.5212 + ** The packing computed by the previous block is biased toward the siblings
1.5213 + ** on the left side. The left siblings are always nearly full, while the
1.5214 + ** right-most sibling might be nearly empty. This block of code attempts
1.5215 + ** to adjust the packing of siblings to get a better balance.
1.5216 + **
1.5217 + ** This adjustment is more than an optimization. The packing above might
1.5218 + ** be so out of balance as to be illegal. For example, the right-most
1.5219 + ** sibling might be completely empty. This adjustment is not optional.
1.5220 + */
1.5221 + for(i=k-1; i>0; i--){
1.5222 + int szRight = szNew[i]; /* Size of sibling on the right */
1.5223 + int szLeft = szNew[i-1]; /* Size of sibling on the left */
1.5224 + int r; /* Index of right-most cell in left sibling */
1.5225 + int d; /* Index of first cell to the left of right sibling */
1.5226 +
1.5227 + r = cntNew[i-1] - 1;
1.5228 + d = r + 1 - leafData;
1.5229 + assert( d<nMaxCells );
1.5230 + assert( r<nMaxCells );
1.5231 + while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
1.5232 + szRight += szCell[d] + 2;
1.5233 + szLeft -= szCell[r] + 2;
1.5234 + cntNew[i-1]--;
1.5235 + r = cntNew[i-1] - 1;
1.5236 + d = r + 1 - leafData;
1.5237 + }
1.5238 + szNew[i] = szRight;
1.5239 + szNew[i-1] = szLeft;
1.5240 + }
1.5241 +
1.5242 + /* Either we found one or more cells (cntNew[0]>0) or we are
1.5243 + ** a virtual root page. A virtual root page is when the real root
1.5244 + ** page is page 1 and we are the only child of that page.
1.5245 + */
1.5246 + assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
1.5247 +
1.5248 + /*
1.5249 + ** Allocate k new pages. Reuse old pages where possible.
1.5250 + */
1.5251 + assert( pPage->pgno>1 );
1.5252 + pageFlags = pPage->aData[0];
1.5253 + for(i=0; i<k; i++){
1.5254 + MemPage *pNew;
1.5255 + if( i<nOld ){
1.5256 + pNew = apNew[i] = apOld[i];
1.5257 + pgnoNew[i] = pgnoOld[i];
1.5258 + apOld[i] = 0;
1.5259 + rc = sqlite3PagerWrite(pNew->pDbPage);
1.5260 + nNew++;
1.5261 + if( rc ) goto balance_cleanup;
1.5262 + }else{
1.5263 + assert( i>0 );
1.5264 + rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
1.5265 + if( rc ) goto balance_cleanup;
1.5266 + apNew[i] = pNew;
1.5267 + nNew++;
1.5268 + }
1.5269 + }
1.5270 +
1.5271 + /* Free any old pages that were not reused as new pages.
1.5272 + */
1.5273 + while( i<nOld ){
1.5274 + rc = freePage(apOld[i]);
1.5275 + if( rc ) goto balance_cleanup;
1.5276 + releasePage(apOld[i]);
1.5277 + apOld[i] = 0;
1.5278 + i++;
1.5279 + }
1.5280 +
1.5281 + /*
1.5282 + ** Put the new pages in ascending order. This helps to
1.5283 + ** keep entries in the disk file in order so that a scan
1.5284 + ** of the table is a linear scan through the file. That
1.5285 + ** in turn helps the operating system to deliver pages
1.5286 + ** from the disk more rapidly.
1.5287 + **
1.5288 + ** An O(n^2) insertion sort algorithm is used, but since
1.5289 + ** n is never more than NB (a small constant), that should
1.5290 + ** not be a problem.
1.5291 + **
1.5292 + ** When NB==3, this one optimization makes the database
1.5293 + ** about 25% faster for large insertions and deletions.
1.5294 + */
1.5295 + for(i=0; i<k-1; i++){
1.5296 + int minV = pgnoNew[i];
1.5297 + int minI = i;
1.5298 + for(j=i+1; j<k; j++){
1.5299 + if( pgnoNew[j]<(unsigned)minV ){
1.5300 + minI = j;
1.5301 + minV = pgnoNew[j];
1.5302 + }
1.5303 + }
1.5304 + if( minI>i ){
1.5305 + int t;
1.5306 + MemPage *pT;
1.5307 + t = pgnoNew[i];
1.5308 + pT = apNew[i];
1.5309 + pgnoNew[i] = pgnoNew[minI];
1.5310 + apNew[i] = apNew[minI];
1.5311 + pgnoNew[minI] = t;
1.5312 + apNew[minI] = pT;
1.5313 + }
1.5314 + }
1.5315 + TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
1.5316 + pgnoOld[0],
1.5317 + nOld>=2 ? pgnoOld[1] : 0,
1.5318 + nOld>=3 ? pgnoOld[2] : 0,
1.5319 + pgnoNew[0], szNew[0],
1.5320 + nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
1.5321 + nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
1.5322 + nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
1.5323 + nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
1.5324 +
1.5325 + /*
1.5326 + ** Evenly distribute the data in apCell[] across the new pages.
1.5327 + ** Insert divider cells into pParent as necessary.
1.5328 + */
1.5329 + j = 0;
1.5330 + for(i=0; i<nNew; i++){
1.5331 + /* Assemble the new sibling page. */
1.5332 + MemPage *pNew = apNew[i];
1.5333 + assert( j<nMaxCells );
1.5334 + assert( pNew->pgno==pgnoNew[i] );
1.5335 + zeroPage(pNew, pageFlags);
1.5336 + assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
1.5337 + assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
1.5338 + assert( pNew->nOverflow==0 );
1.5339 +
1.5340 + /* If this is an auto-vacuum database, update the pointer map entries
1.5341 + ** that point to the siblings that were rearranged. These can be: left
1.5342 + ** children of cells, the right-child of the page, or overflow pages
1.5343 + ** pointed to by cells.
1.5344 + */
1.5345 + if( ISAUTOVACUUM ){
1.5346 + for(k=j; k<cntNew[i]; k++){
1.5347 + assert( k<nMaxCells );
1.5348 + if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
1.5349 + rc = ptrmapPutOvfl(pNew, k-j);
1.5350 + if( rc==SQLITE_OK && leafCorrection==0 ){
1.5351 + rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
1.5352 + }
1.5353 + if( rc!=SQLITE_OK ){
1.5354 + goto balance_cleanup;
1.5355 + }
1.5356 + }
1.5357 + }
1.5358 + }
1.5359 +
1.5360 + j = cntNew[i];
1.5361 +
1.5362 + /* If the sibling page assembled above was not the right-most sibling,
1.5363 + ** insert a divider cell into the parent page.
1.5364 + */
1.5365 + if( i<nNew-1 && j<nCell ){
1.5366 + u8 *pCell;
1.5367 + u8 *pTemp;
1.5368 + int sz;
1.5369 +
1.5370 + assert( j<nMaxCells );
1.5371 + pCell = apCell[j];
1.5372 + sz = szCell[j] + leafCorrection;
1.5373 + pTemp = &aSpace2[iSpace2];
1.5374 + if( !pNew->leaf ){
1.5375 + memcpy(&pNew->aData[8], pCell, 4);
1.5376 + if( ISAUTOVACUUM
1.5377 + && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
1.5378 + ){
1.5379 + rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
1.5380 + if( rc!=SQLITE_OK ){
1.5381 + goto balance_cleanup;
1.5382 + }
1.5383 + }
1.5384 + }else if( leafData ){
1.5385 + /* If the tree is a leaf-data tree, and the siblings are leaves,
1.5386 + ** then there is no divider cell in apCell[]. Instead, the divider
1.5387 + ** cell consists of the integer key for the right-most cell of
1.5388 + ** the sibling-page assembled above only.
1.5389 + */
1.5390 + CellInfo info;
1.5391 + j--;
1.5392 + sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
1.5393 + pCell = pTemp;
1.5394 + fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
1.5395 + pTemp = 0;
1.5396 + }else{
1.5397 + pCell -= 4;
1.5398 + /* Obscure case for non-leaf-data trees: If the cell at pCell was
1.5399 + ** previously stored on a leaf node, and its reported size was 4
1.5400 + ** bytes, then it may actually be smaller than this
1.5401 + ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
1.5402 + ** any cell). But it is important to pass the correct size to
1.5403 + ** insertCell(), so reparse the cell now.
1.5404 + **
1.5405 + ** Note that this can never happen in an SQLite data file, as all
1.5406 + ** cells are at least 4 bytes. It only happens in b-trees used
1.5407 + ** to evaluate "IN (SELECT ...)" and similar clauses.
1.5408 + */
1.5409 + if( szCell[j]==4 ){
1.5410 + assert(leafCorrection==4);
1.5411 + sz = cellSizePtr(pParent, pCell);
1.5412 + }
1.5413 + }
1.5414 + iSpace2 += sz;
1.5415 + assert( sz<=pBt->pageSize/4 );
1.5416 + assert( iSpace2<=pBt->pageSize );
1.5417 + rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
1.5418 + if( rc!=SQLITE_OK ) goto balance_cleanup;
1.5419 + put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
1.5420 +
1.5421 + /* If this is an auto-vacuum database, and not a leaf-data tree,
1.5422 + ** then update the pointer map with an entry for the overflow page
1.5423 + ** that the cell just inserted points to (if any).
1.5424 + */
1.5425 + if( ISAUTOVACUUM && !leafData ){
1.5426 + rc = ptrmapPutOvfl(pParent, nxDiv);
1.5427 + if( rc!=SQLITE_OK ){
1.5428 + goto balance_cleanup;
1.5429 + }
1.5430 + }
1.5431 + j++;
1.5432 + nxDiv++;
1.5433 + }
1.5434 +
1.5435 + /* Set the pointer-map entry for the new sibling page. */
1.5436 + if( ISAUTOVACUUM ){
1.5437 + rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
1.5438 + if( rc!=SQLITE_OK ){
1.5439 + goto balance_cleanup;
1.5440 + }
1.5441 + }
1.5442 + }
1.5443 + assert( j==nCell );
1.5444 + assert( nOld>0 );
1.5445 + assert( nNew>0 );
1.5446 + if( (pageFlags & PTF_LEAF)==0 ){
1.5447 + u8 *zChild = &apCopy[nOld-1]->aData[8];
1.5448 + memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
1.5449 + if( ISAUTOVACUUM ){
1.5450 + rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
1.5451 + if( rc!=SQLITE_OK ){
1.5452 + goto balance_cleanup;
1.5453 + }
1.5454 + }
1.5455 + }
1.5456 + if( nxDiv==pParent->nCell+pParent->nOverflow ){
1.5457 + /* Right-most sibling is the right-most child of pParent */
1.5458 + put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
1.5459 + }else{
1.5460 + /* Right-most sibling is the left child of the first entry in pParent
1.5461 + ** past the right-most divider entry */
1.5462 + put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
1.5463 + }
1.5464 +
1.5465 + /*
1.5466 + ** Balance the parent page. Note that the current page (pPage) might
1.5467 + ** have been added to the freelist so it might no longer be initialized.
1.5468 + ** But the parent page will always be initialized.
1.5469 + */
1.5470 + assert( pParent->isInit );
1.5471 + sqlite3ScratchFree(apCell);
1.5472 + apCell = 0;
1.5473 + releasePage(pPage);
1.5474 + pCur->iPage--;
1.5475 + rc = balance(pCur, 0);
1.5476 +
1.5477 + /*
1.5478 + ** Cleanup before returning.
1.5479 + */
1.5480 +balance_cleanup:
1.5481 + sqlite3PageFree(aSpace2);
1.5482 + sqlite3ScratchFree(apCell);
1.5483 + for(i=0; i<nOld; i++){
1.5484 + releasePage(apOld[i]);
1.5485 + }
1.5486 + for(i=0; i<nNew; i++){
1.5487 + releasePage(apNew[i]);
1.5488 + }
1.5489 +
1.5490 + /* releasePage(pParent); */
1.5491 + TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
1.5492 + pPage->pgno, nOld, nNew, nCell));
1.5493 +
1.5494 + return rc;
1.5495 +}
1.5496 +
1.5497 +/*
1.5498 +** This routine is called for the root page of a btree when the root
1.5499 +** page contains no cells. This is an opportunity to make the tree
1.5500 +** shallower by one level.
1.5501 +*/
1.5502 +static int balance_shallower(BtCursor *pCur){
1.5503 + MemPage *pPage; /* Root page of B-Tree */
1.5504 + MemPage *pChild; /* The only child page of pPage */
1.5505 + Pgno pgnoChild; /* Page number for pChild */
1.5506 + int rc = SQLITE_OK; /* Return code from subprocedures */
1.5507 + BtShared *pBt; /* The main BTree structure */
1.5508 + int mxCellPerPage; /* Maximum number of cells per page */
1.5509 + u8 **apCell; /* All cells from pages being balanced */
1.5510 + u16 *szCell; /* Local size of all cells */
1.5511 +
1.5512 + assert( pCur->iPage==0 );
1.5513 + pPage = pCur->apPage[0];
1.5514 +
1.5515 + assert( pPage->nCell==0 );
1.5516 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.5517 + pBt = pPage->pBt;
1.5518 + mxCellPerPage = MX_CELL(pBt);
1.5519 + apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
1.5520 + if( apCell==0 ) return SQLITE_NOMEM;
1.5521 + szCell = (u16*)&apCell[mxCellPerPage];
1.5522 + if( pPage->leaf ){
1.5523 + /* The table is completely empty */
1.5524 + TRACE(("BALANCE: empty table %d\n", pPage->pgno));
1.5525 + }else{
1.5526 + /* The root page is empty but has one child. Transfer the
1.5527 + ** information from that one child into the root page if it
1.5528 + ** will fit. This reduces the depth of the tree by one.
1.5529 + **
1.5530 + ** If the root page is page 1, it has less space available than
1.5531 + ** its child (due to the 100 byte header that occurs at the beginning
1.5532 + ** of the database fle), so it might not be able to hold all of the
1.5533 + ** information currently contained in the child. If this is the
1.5534 + ** case, then do not do the transfer. Leave page 1 empty except
1.5535 + ** for the right-pointer to the child page. The child page becomes
1.5536 + ** the virtual root of the tree.
1.5537 + */
1.5538 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.5539 + pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.5540 + assert( pgnoChild>0 );
1.5541 + assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
1.5542 + rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
1.5543 + if( rc ) goto end_shallow_balance;
1.5544 + if( pPage->pgno==1 ){
1.5545 + rc = sqlite3BtreeInitPage(pChild);
1.5546 + if( rc ) goto end_shallow_balance;
1.5547 + assert( pChild->nOverflow==0 );
1.5548 + if( pChild->nFree>=100 ){
1.5549 + /* The child information will fit on the root page, so do the
1.5550 + ** copy */
1.5551 + int i;
1.5552 + zeroPage(pPage, pChild->aData[0]);
1.5553 + for(i=0; i<pChild->nCell; i++){
1.5554 + apCell[i] = findCell(pChild,i);
1.5555 + szCell[i] = cellSizePtr(pChild, apCell[i]);
1.5556 + }
1.5557 + assemblePage(pPage, pChild->nCell, apCell, szCell);
1.5558 + /* Copy the right-pointer of the child to the parent. */
1.5559 + put4byte(&pPage->aData[pPage->hdrOffset+8],
1.5560 + get4byte(&pChild->aData[pChild->hdrOffset+8]));
1.5561 + freePage(pChild);
1.5562 + TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
1.5563 + }else{
1.5564 + /* The child has more information that will fit on the root.
1.5565 + ** The tree is already balanced. Do nothing. */
1.5566 + TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
1.5567 + }
1.5568 + }else{
1.5569 + memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
1.5570 + pPage->isInit = 0;
1.5571 + rc = sqlite3BtreeInitPage(pPage);
1.5572 + assert( rc==SQLITE_OK );
1.5573 + freePage(pChild);
1.5574 + TRACE(("BALANCE: transfer child %d into root %d\n",
1.5575 + pChild->pgno, pPage->pgno));
1.5576 + }
1.5577 + assert( pPage->nOverflow==0 );
1.5578 + if( ISAUTOVACUUM ){
1.5579 + rc = setChildPtrmaps(pPage);
1.5580 + }
1.5581 + releasePage(pChild);
1.5582 + }
1.5583 +end_shallow_balance:
1.5584 + sqlite3_free(apCell);
1.5585 + return rc;
1.5586 +}
1.5587 +
1.5588 +
1.5589 +/*
1.5590 +** The root page is overfull
1.5591 +**
1.5592 +** When this happens, Create a new child page and copy the
1.5593 +** contents of the root into the child. Then make the root
1.5594 +** page an empty page with rightChild pointing to the new
1.5595 +** child. Finally, call balance_internal() on the new child
1.5596 +** to cause it to split.
1.5597 +*/
1.5598 +static int balance_deeper(BtCursor *pCur){
1.5599 + int rc; /* Return value from subprocedures */
1.5600 + MemPage *pPage; /* Pointer to the root page */
1.5601 + MemPage *pChild; /* Pointer to a new child page */
1.5602 + Pgno pgnoChild; /* Page number of the new child page */
1.5603 + BtShared *pBt; /* The BTree */
1.5604 + int usableSize; /* Total usable size of a page */
1.5605 + u8 *data; /* Content of the parent page */
1.5606 + u8 *cdata; /* Content of the child page */
1.5607 + int hdr; /* Offset to page header in parent */
1.5608 + int cbrk; /* Offset to content of first cell in parent */
1.5609 +
1.5610 + assert( pCur->iPage==0 );
1.5611 + assert( pCur->apPage[0]->nOverflow>0 );
1.5612 +
1.5613 + VVA_ONLY( pCur->pagesShuffled = 1 );
1.5614 + pPage = pCur->apPage[0];
1.5615 + pBt = pPage->pBt;
1.5616 + assert( sqlite3_mutex_held(pBt->mutex) );
1.5617 + rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
1.5618 + if( rc ) return rc;
1.5619 + assert( sqlite3PagerIswriteable(pChild->pDbPage) );
1.5620 + usableSize = pBt->usableSize;
1.5621 + data = pPage->aData;
1.5622 + hdr = pPage->hdrOffset;
1.5623 + cbrk = get2byte(&data[hdr+5]);
1.5624 + cdata = pChild->aData;
1.5625 + memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
1.5626 + memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
1.5627 +
1.5628 + rc = sqlite3BtreeInitPage(pChild);
1.5629 + if( rc==SQLITE_OK ){
1.5630 + int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
1.5631 + memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
1.5632 + pChild->nOverflow = pPage->nOverflow;
1.5633 + if( pChild->nOverflow ){
1.5634 + pChild->nFree = 0;
1.5635 + }
1.5636 + assert( pChild->nCell==pPage->nCell );
1.5637 + zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
1.5638 + put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
1.5639 + TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
1.5640 + if( ISAUTOVACUUM ){
1.5641 + rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
1.5642 + if( rc==SQLITE_OK ){
1.5643 + rc = setChildPtrmaps(pChild);
1.5644 + }
1.5645 + }
1.5646 + }
1.5647 +
1.5648 + if( rc==SQLITE_OK ){
1.5649 + pCur->iPage++;
1.5650 + pCur->apPage[1] = pChild;
1.5651 + pCur->aiIdx[0] = 0;
1.5652 + rc = balance_nonroot(pCur);
1.5653 + }else{
1.5654 + releasePage(pChild);
1.5655 + }
1.5656 +
1.5657 + return rc;
1.5658 +}
1.5659 +
1.5660 +/*
1.5661 +** The page that pCur currently points to has just been modified in
1.5662 +** some way. This function figures out if this modification means the
1.5663 +** tree needs to be balanced, and if so calls the appropriate balancing
1.5664 +** routine.
1.5665 +**
1.5666 +** Parameter isInsert is true if a new cell was just inserted into the
1.5667 +** page, or false otherwise.
1.5668 +*/
1.5669 +static int balance(BtCursor *pCur, int isInsert){
1.5670 + int rc = SQLITE_OK;
1.5671 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.5672 +
1.5673 + assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1.5674 + if( pCur->iPage==0 ){
1.5675 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.5676 + if( rc==SQLITE_OK && pPage->nOverflow>0 ){
1.5677 + rc = balance_deeper(pCur);
1.5678 + }
1.5679 + if( rc==SQLITE_OK && pPage->nCell==0 ){
1.5680 + rc = balance_shallower(pCur);
1.5681 + }
1.5682 + }else{
1.5683 + if( pPage->nOverflow>0 ||
1.5684 + (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
1.5685 + rc = balance_nonroot(pCur);
1.5686 + }
1.5687 + }
1.5688 + return rc;
1.5689 +}
1.5690 +
1.5691 +/*
1.5692 +** This routine checks all cursors that point to table pgnoRoot.
1.5693 +** If any of those cursors were opened with wrFlag==0 in a different
1.5694 +** database connection (a database connection that shares the pager
1.5695 +** cache with the current connection) and that other connection
1.5696 +** is not in the ReadUncommmitted state, then this routine returns
1.5697 +** SQLITE_LOCKED.
1.5698 +**
1.5699 +** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
1.5700 +** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
1.5701 +** blob cursors are used for both reading and writing.
1.5702 +**
1.5703 +** When pgnoRoot is the root page of an intkey table, this function is also
1.5704 +** responsible for invalidating incremental blob cursors when the table row
1.5705 +** on which they are opened is deleted or modified. Cursors are invalidated
1.5706 +** according to the following rules:
1.5707 +**
1.5708 +** 1) When BtreeClearTable() is called to completely delete the contents
1.5709 +** of a B-Tree table, pExclude is set to zero and parameter iRow is
1.5710 +** set to non-zero. In this case all incremental blob cursors open
1.5711 +** on the table rooted at pgnoRoot are invalidated.
1.5712 +**
1.5713 +** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
1.5714 +** modify a table row via an SQL statement, pExclude is set to the
1.5715 +** write cursor used to do the modification and parameter iRow is set
1.5716 +** to the integer row id of the B-Tree entry being modified. Unless
1.5717 +** pExclude is itself an incremental blob cursor, then all incremental
1.5718 +** blob cursors open on row iRow of the B-Tree are invalidated.
1.5719 +**
1.5720 +** 3) If both pExclude and iRow are set to zero, no incremental blob
1.5721 +** cursors are invalidated.
1.5722 +*/
1.5723 +static int checkReadLocks(
1.5724 + Btree *pBtree,
1.5725 + Pgno pgnoRoot,
1.5726 + BtCursor *pExclude,
1.5727 + i64 iRow
1.5728 +){
1.5729 + BtCursor *p;
1.5730 + BtShared *pBt = pBtree->pBt;
1.5731 + sqlite3 *db = pBtree->db;
1.5732 + assert( sqlite3BtreeHoldsMutex(pBtree) );
1.5733 + for(p=pBt->pCursor; p; p=p->pNext){
1.5734 + if( p==pExclude ) continue;
1.5735 + if( p->pgnoRoot!=pgnoRoot ) continue;
1.5736 +#ifndef SQLITE_OMIT_INCRBLOB
1.5737 + if( p->isIncrblobHandle && (
1.5738 + (!pExclude && iRow)
1.5739 + || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
1.5740 + )){
1.5741 + p->eState = CURSOR_INVALID;
1.5742 + }
1.5743 +#endif
1.5744 + if( p->eState!=CURSOR_VALID ) continue;
1.5745 + if( p->wrFlag==0
1.5746 +#ifndef SQLITE_OMIT_INCRBLOB
1.5747 + || p->isIncrblobHandle
1.5748 +#endif
1.5749 + ){
1.5750 + sqlite3 *dbOther = p->pBtree->db;
1.5751 + if( dbOther==0 ||
1.5752 + (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
1.5753 + return SQLITE_LOCKED;
1.5754 + }
1.5755 + }
1.5756 + }
1.5757 + return SQLITE_OK;
1.5758 +}
1.5759 +
1.5760 +/*
1.5761 +** Insert a new record into the BTree. The key is given by (pKey,nKey)
1.5762 +** and the data is given by (pData,nData). The cursor is used only to
1.5763 +** define what table the record should be inserted into. The cursor
1.5764 +** is left pointing at a random location.
1.5765 +**
1.5766 +** For an INTKEY table, only the nKey value of the key is used. pKey is
1.5767 +** ignored. For a ZERODATA table, the pData and nData are both ignored.
1.5768 +*/
1.5769 +int sqlite3BtreeInsert(
1.5770 + BtCursor *pCur, /* Insert data into the table of this cursor */
1.5771 + const void *pKey, i64 nKey, /* The key of the new record */
1.5772 + const void *pData, int nData, /* The data of the new record */
1.5773 + int nZero, /* Number of extra 0 bytes to append to data */
1.5774 + int appendBias /* True if this is likely an append */
1.5775 +){
1.5776 + int rc;
1.5777 + int loc;
1.5778 + int szNew;
1.5779 + int idx;
1.5780 + MemPage *pPage;
1.5781 + Btree *p = pCur->pBtree;
1.5782 + BtShared *pBt = p->pBt;
1.5783 + unsigned char *oldCell;
1.5784 + unsigned char *newCell = 0;
1.5785 +
1.5786 + assert( cursorHoldsMutex(pCur) );
1.5787 + if( pBt->inTransaction!=TRANS_WRITE ){
1.5788 + /* Must start a transaction before doing an insert */
1.5789 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.5790 + return rc;
1.5791 + }
1.5792 + assert( !pBt->readOnly );
1.5793 + if( !pCur->wrFlag ){
1.5794 + return SQLITE_PERM; /* Cursor not open for writing */
1.5795 + }
1.5796 + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
1.5797 + return SQLITE_LOCKED; /* The table pCur points to has a read lock */
1.5798 + }
1.5799 + if( pCur->eState==CURSOR_FAULT ){
1.5800 + return pCur->skip;
1.5801 + }
1.5802 +
1.5803 + /* Save the positions of any other cursors open on this table */
1.5804 + sqlite3BtreeClearCursor(pCur);
1.5805 + if(
1.5806 + SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
1.5807 + SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
1.5808 + ){
1.5809 + return rc;
1.5810 + }
1.5811 +
1.5812 + pPage = pCur->apPage[pCur->iPage];
1.5813 + assert( pPage->intKey || nKey>=0 );
1.5814 + assert( pPage->leaf || !pPage->intKey );
1.5815 + TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
1.5816 + pCur->pgnoRoot, nKey, nData, pPage->pgno,
1.5817 + loc==0 ? "overwrite" : "new entry"));
1.5818 + assert( pPage->isInit );
1.5819 + allocateTempSpace(pBt);
1.5820 + newCell = pBt->pTmpSpace;
1.5821 + if( newCell==0 ) return SQLITE_NOMEM;
1.5822 + rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
1.5823 + if( rc ) goto end_insert;
1.5824 + assert( szNew==cellSizePtr(pPage, newCell) );
1.5825 + assert( szNew<=MX_CELL_SIZE(pBt) );
1.5826 + idx = pCur->aiIdx[pCur->iPage];
1.5827 + if( loc==0 && CURSOR_VALID==pCur->eState ){
1.5828 + u16 szOld;
1.5829 + assert( idx<pPage->nCell );
1.5830 + rc = sqlite3PagerWrite(pPage->pDbPage);
1.5831 + if( rc ){
1.5832 + goto end_insert;
1.5833 + }
1.5834 + oldCell = findCell(pPage, idx);
1.5835 + if( !pPage->leaf ){
1.5836 + memcpy(newCell, oldCell, 4);
1.5837 + }
1.5838 + szOld = cellSizePtr(pPage, oldCell);
1.5839 + rc = clearCell(pPage, oldCell);
1.5840 + if( rc ) goto end_insert;
1.5841 + rc = dropCell(pPage, idx, szOld);
1.5842 + if( rc ) goto end_insert;
1.5843 + }else if( loc<0 && pPage->nCell>0 ){
1.5844 + assert( pPage->leaf );
1.5845 + idx = ++pCur->aiIdx[pCur->iPage];
1.5846 + pCur->info.nSize = 0;
1.5847 + pCur->validNKey = 0;
1.5848 + }else{
1.5849 + assert( pPage->leaf );
1.5850 + }
1.5851 + rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
1.5852 + if( rc!=SQLITE_OK ) goto end_insert;
1.5853 + rc = balance(pCur, 1);
1.5854 + if( rc==SQLITE_OK ){
1.5855 + moveToRoot(pCur);
1.5856 + }
1.5857 +end_insert:
1.5858 + return rc;
1.5859 +}
1.5860 +
1.5861 +/*
1.5862 +** Delete the entry that the cursor is pointing to. The cursor
1.5863 +** is left pointing at a arbitrary location.
1.5864 +*/
1.5865 +int sqlite3BtreeDelete(BtCursor *pCur){
1.5866 + MemPage *pPage = pCur->apPage[pCur->iPage];
1.5867 + int idx;
1.5868 + unsigned char *pCell;
1.5869 + int rc;
1.5870 + Pgno pgnoChild = 0;
1.5871 + Btree *p = pCur->pBtree;
1.5872 + BtShared *pBt = p->pBt;
1.5873 +
1.5874 + assert( cursorHoldsMutex(pCur) );
1.5875 + assert( pPage->isInit );
1.5876 + if( pBt->inTransaction!=TRANS_WRITE ){
1.5877 + /* Must start a transaction before doing a delete */
1.5878 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.5879 + return rc;
1.5880 + }
1.5881 + assert( !pBt->readOnly );
1.5882 + if( pCur->eState==CURSOR_FAULT ){
1.5883 + return pCur->skip;
1.5884 + }
1.5885 + if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
1.5886 + return SQLITE_ERROR; /* The cursor is not pointing to anything */
1.5887 + }
1.5888 + if( !pCur->wrFlag ){
1.5889 + return SQLITE_PERM; /* Did not open this cursor for writing */
1.5890 + }
1.5891 + if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
1.5892 + return SQLITE_LOCKED; /* The table pCur points to has a read lock */
1.5893 + }
1.5894 +
1.5895 + /* Restore the current cursor position (a no-op if the cursor is not in
1.5896 + ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
1.5897 + ** open on the same table. Then call sqlite3PagerWrite() on the page
1.5898 + ** that the entry will be deleted from.
1.5899 + */
1.5900 + if(
1.5901 + (rc = restoreCursorPosition(pCur))!=0 ||
1.5902 + (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
1.5903 + (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
1.5904 + ){
1.5905 + return rc;
1.5906 + }
1.5907 +
1.5908 + /* Locate the cell within its page and leave pCell pointing to the
1.5909 + ** data. The clearCell() call frees any overflow pages associated with the
1.5910 + ** cell. The cell itself is still intact.
1.5911 + */
1.5912 + idx = pCur->aiIdx[pCur->iPage];
1.5913 + pCell = findCell(pPage, idx);
1.5914 + if( !pPage->leaf ){
1.5915 + pgnoChild = get4byte(pCell);
1.5916 + }
1.5917 + rc = clearCell(pPage, pCell);
1.5918 + if( rc ){
1.5919 + return rc;
1.5920 + }
1.5921 +
1.5922 + if( !pPage->leaf ){
1.5923 + /*
1.5924 + ** The entry we are about to delete is not a leaf so if we do not
1.5925 + ** do something we will leave a hole on an internal page.
1.5926 + ** We have to fill the hole by moving in a cell from a leaf. The
1.5927 + ** next Cell after the one to be deleted is guaranteed to exist and
1.5928 + ** to be a leaf so we can use it.
1.5929 + */
1.5930 + BtCursor leafCur;
1.5931 + MemPage *pLeafPage;
1.5932 +
1.5933 + unsigned char *pNext;
1.5934 + int notUsed;
1.5935 + unsigned char *tempCell = 0;
1.5936 + assert( !pPage->intKey );
1.5937 + sqlite3BtreeGetTempCursor(pCur, &leafCur);
1.5938 + rc = sqlite3BtreeNext(&leafCur, ¬Used);
1.5939 + if( rc==SQLITE_OK ){
1.5940 + assert( leafCur.aiIdx[leafCur.iPage]==0 );
1.5941 + pLeafPage = leafCur.apPage[leafCur.iPage];
1.5942 + rc = sqlite3PagerWrite(pLeafPage->pDbPage);
1.5943 + }
1.5944 + if( rc==SQLITE_OK ){
1.5945 + int leafCursorInvalid = 0;
1.5946 + u16 szNext;
1.5947 + TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
1.5948 + pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
1.5949 + dropCell(pPage, idx, cellSizePtr(pPage, pCell));
1.5950 + pNext = findCell(pLeafPage, 0);
1.5951 + szNext = cellSizePtr(pLeafPage, pNext);
1.5952 + assert( MX_CELL_SIZE(pBt)>=szNext+4 );
1.5953 + allocateTempSpace(pBt);
1.5954 + tempCell = pBt->pTmpSpace;
1.5955 + if( tempCell==0 ){
1.5956 + rc = SQLITE_NOMEM;
1.5957 + }
1.5958 + if( rc==SQLITE_OK ){
1.5959 + rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
1.5960 + }
1.5961 +
1.5962 +
1.5963 + /* The "if" statement in the next code block is critical. The
1.5964 + ** slightest error in that statement would allow SQLite to operate
1.5965 + ** correctly most of the time but produce very rare failures. To
1.5966 + ** guard against this, the following macros help to verify that
1.5967 + ** the "if" statement is well tested.
1.5968 + */
1.5969 + testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
1.5970 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5971 + testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
1.5972 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5973 + testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
1.5974 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5975 + testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
1.5976 + && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
1.5977 + testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
1.5978 + && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
1.5979 +
1.5980 +
1.5981 + if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
1.5982 + (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
1.5983 + ){
1.5984 + /* This branch is taken if the internal node is now either overflowing
1.5985 + ** or underfull and the leaf node will be underfull after the just cell
1.5986 + ** copied to the internal node is deleted from it. This is a special
1.5987 + ** case because the call to balance() to correct the internal node
1.5988 + ** may change the tree structure and invalidate the contents of
1.5989 + ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
1.5990 + ** used by the balance() required to correct the underfull leaf
1.5991 + ** node.
1.5992 + **
1.5993 + ** The formula used in the expression above are based on facets of
1.5994 + ** the SQLite file-format that do not change over time.
1.5995 + */
1.5996 + testcase( pPage->nFree==pBt->usableSize*2/3+1 );
1.5997 + testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
1.5998 + leafCursorInvalid = 1;
1.5999 + }
1.6000 +
1.6001 + if( rc==SQLITE_OK ){
1.6002 + put4byte(findOverflowCell(pPage, idx), pgnoChild);
1.6003 + VVA_ONLY( pCur->pagesShuffled = 0 );
1.6004 + rc = balance(pCur, 0);
1.6005 + }
1.6006 +
1.6007 + if( rc==SQLITE_OK && leafCursorInvalid ){
1.6008 + /* The leaf-node is now underfull and so the tree needs to be
1.6009 + ** rebalanced. However, the balance() operation on the internal
1.6010 + ** node above may have modified the structure of the B-Tree and
1.6011 + ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
1.6012 + ** may not be trusted.
1.6013 + **
1.6014 + ** It is not possible to copy the ancestry from pCur, as the same
1.6015 + ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
1.6016 + ** arrays.
1.6017 + **
1.6018 + ** The call to saveCursorPosition() below internally saves the
1.6019 + ** key that leafCur is currently pointing to. Currently, there
1.6020 + ** are two copies of that key in the tree - one here on the leaf
1.6021 + ** page and one on some internal node in the tree. The copy on
1.6022 + ** the leaf node is always the next key in tree-order after the
1.6023 + ** copy on the internal node. So, the call to sqlite3BtreeNext()
1.6024 + ** calls restoreCursorPosition() to point the cursor to the copy
1.6025 + ** stored on the internal node, then advances to the next entry,
1.6026 + ** which happens to be the copy of the key on the internal node.
1.6027 + ** Net effect: leafCur is pointing back to the duplicate cell
1.6028 + ** that needs to be removed, and the leafCur.apPage[] and
1.6029 + ** leafCur.aiIdx[] arrays are correct.
1.6030 + */
1.6031 + VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
1.6032 + rc = saveCursorPosition(&leafCur);
1.6033 + if( rc==SQLITE_OK ){
1.6034 + rc = sqlite3BtreeNext(&leafCur, ¬Used);
1.6035 + }
1.6036 + pLeafPage = leafCur.apPage[leafCur.iPage];
1.6037 + assert( pLeafPage->pgno==leafPgno );
1.6038 + assert( leafCur.aiIdx[leafCur.iPage]==0 );
1.6039 + }
1.6040 +
1.6041 + if( rc==SQLITE_OK ){
1.6042 + dropCell(pLeafPage, 0, szNext);
1.6043 + VVA_ONLY( leafCur.pagesShuffled = 0 );
1.6044 + rc = balance(&leafCur, 0);
1.6045 + assert( leafCursorInvalid || !leafCur.pagesShuffled
1.6046 + || !pCur->pagesShuffled );
1.6047 + }
1.6048 + }
1.6049 + sqlite3BtreeReleaseTempCursor(&leafCur);
1.6050 + }else{
1.6051 + TRACE(("DELETE: table=%d delete from leaf %d\n",
1.6052 + pCur->pgnoRoot, pPage->pgno));
1.6053 + rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
1.6054 + if( rc==SQLITE_OK ){
1.6055 + rc = balance(pCur, 0);
1.6056 + }
1.6057 + }
1.6058 + if( rc==SQLITE_OK ){
1.6059 + moveToRoot(pCur);
1.6060 + }
1.6061 + return rc;
1.6062 +}
1.6063 +
1.6064 +/*
1.6065 +** Create a new BTree table. Write into *piTable the page
1.6066 +** number for the root page of the new table and return SQLITE_OK.
1.6067 +** On any failure an error code is returned and *piTable is not written.
1.6068 +** The type of table is determined by the flags parameter. Only the
1.6069 +** following values of flags are currently in use. Other values for
1.6070 +** flags might not work:
1.6071 +**
1.6072 +** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
1.6073 +** BTREE_ZERODATA Used for SQL indices
1.6074 +*/
1.6075 +static int btreeCreateTable(Btree *p, int *piTable, int flags){
1.6076 + BtShared *pBt = p->pBt;
1.6077 + MemPage *pRoot;
1.6078 + Pgno pgnoRoot;
1.6079 + int rc;
1.6080 +
1.6081 + assert( sqlite3BtreeHoldsMutex(p) );
1.6082 + if( pBt->inTransaction!=TRANS_WRITE ){
1.6083 + /* Must start a transaction first */
1.6084 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6085 + return rc;
1.6086 + }
1.6087 + assert( !pBt->readOnly );
1.6088 +
1.6089 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6090 + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
1.6091 + if( rc ){
1.6092 + return rc;
1.6093 + }
1.6094 +#else
1.6095 + if( pBt->autoVacuum ){
1.6096 + Pgno pgnoMove; /* Move a page here to make room for the root-page */
1.6097 + MemPage *pPageMove; /* The page to move to. */
1.6098 +
1.6099 + /* Creating a new table may require moving an existing database page
1.6100 + ** to make room for the new table's root page. In case this page turns
1.6101 + ** out to be an overflow page, delete all overflow page-map caches
1.6102 + ** held by open cursors.
1.6103 + */
1.6104 + invalidateAllOverflowCache(pBt);
1.6105 +
1.6106 + /* Read the value of meta[3] from the database to determine where the
1.6107 + ** root page of the new table should go. meta[3] is the largest root-page
1.6108 + ** created so far, so the new root-page is (meta[3]+1).
1.6109 + */
1.6110 + rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
1.6111 + if( rc!=SQLITE_OK ){
1.6112 + return rc;
1.6113 + }
1.6114 + pgnoRoot++;
1.6115 +
1.6116 + /* The new root-page may not be allocated on a pointer-map page, or the
1.6117 + ** PENDING_BYTE page.
1.6118 + */
1.6119 + while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
1.6120 + pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
1.6121 + pgnoRoot++;
1.6122 + }
1.6123 + assert( pgnoRoot>=3 );
1.6124 +
1.6125 + /* Allocate a page. The page that currently resides at pgnoRoot will
1.6126 + ** be moved to the allocated page (unless the allocated page happens
1.6127 + ** to reside at pgnoRoot).
1.6128 + */
1.6129 + rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
1.6130 + if( rc!=SQLITE_OK ){
1.6131 + return rc;
1.6132 + }
1.6133 +
1.6134 + if( pgnoMove!=pgnoRoot ){
1.6135 + /* pgnoRoot is the page that will be used for the root-page of
1.6136 + ** the new table (assuming an error did not occur). But we were
1.6137 + ** allocated pgnoMove. If required (i.e. if it was not allocated
1.6138 + ** by extending the file), the current page at position pgnoMove
1.6139 + ** is already journaled.
1.6140 + */
1.6141 + u8 eType;
1.6142 + Pgno iPtrPage;
1.6143 +
1.6144 + releasePage(pPageMove);
1.6145 +
1.6146 + /* Move the page currently at pgnoRoot to pgnoMove. */
1.6147 + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
1.6148 + if( rc!=SQLITE_OK ){
1.6149 + return rc;
1.6150 + }
1.6151 + rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
1.6152 + if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
1.6153 + releasePage(pRoot);
1.6154 + return rc!=SQLITE_OK ? rc : SQLITE_CORRUPT_BKPT; /* A root or free page in this slot cannot be relocated: report corruption instead of returning SQLITE_OK with *piTable unset */
1.6155 + }
1.6156 + assert( eType!=PTRMAP_ROOTPAGE );
1.6157 + assert( eType!=PTRMAP_FREEPAGE );
1.6158 + rc = sqlite3PagerWrite(pRoot->pDbPage);
1.6159 + if( rc!=SQLITE_OK ){
1.6160 + releasePage(pRoot);
1.6161 + return rc;
1.6162 + }
1.6163 + rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
1.6164 + releasePage(pRoot);
1.6165 +
1.6166 + /* Obtain the page at pgnoRoot */
1.6167 + if( rc!=SQLITE_OK ){
1.6168 + return rc;
1.6169 + }
1.6170 + rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
1.6171 + if( rc!=SQLITE_OK ){
1.6172 + return rc;
1.6173 + }
1.6174 + rc = sqlite3PagerWrite(pRoot->pDbPage);
1.6175 + if( rc!=SQLITE_OK ){
1.6176 + releasePage(pRoot);
1.6177 + return rc;
1.6178 + }
1.6179 + }else{
1.6180 + pRoot = pPageMove;
1.6181 + }
1.6182 +
1.6183 + /* Update the pointer-map and meta-data with the new root-page number. */
1.6184 + rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
1.6185 + if( rc ){
1.6186 + releasePage(pRoot);
1.6187 + return rc;
1.6188 + }
1.6189 + rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
1.6190 + if( rc ){
1.6191 + releasePage(pRoot);
1.6192 + return rc;
1.6193 + }
1.6194 +
1.6195 + }else{
1.6196 + rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
1.6197 + if( rc ) return rc;
1.6198 + }
1.6199 +#endif
1.6200 + assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
1.6201 + zeroPage(pRoot, flags | PTF_LEAF); /* The new root starts out as an empty leaf of the requested type */
1.6202 + sqlite3PagerUnref(pRoot->pDbPage);
1.6203 + *piTable = (int)pgnoRoot;
1.6204 + return SQLITE_OK;
1.6205 +}
1.6206 +int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ /* Non-static entry point: runs btreeCreateTable() between sqlite3BtreeEnter() and sqlite3BtreeLeave() */
1.6207 + int rc;
1.6208 + sqlite3BtreeEnter(p);
1.6209 + p->pBt->db = p->db; /* Copy this handle's db pointer onto p->pBt before doing the work */
1.6210 + rc = btreeCreateTable(p, piTable, flags);
1.6211 + sqlite3BtreeLeave(p);
1.6212 + return rc; /* Result of btreeCreateTable(), passed through unchanged */
1.6213 +}
1.6214 +
1.6215 +/*
1.6216 +** Erase the given database page and all its children. Children are always passed to freePage(). Page pgno itself is
1.6217 +** passed to freePage() only if freePageFlag is true; otherwise it is made writable and zeroed as a leaf. Returns SQLITE_CORRUPT_BKPT if pgno is beyond the pager's page count, else the first non-zero code from a callee, else 0.
1.6218 +*/
1.6219 +static int clearDatabasePage(
1.6220 + BtShared *pBt, /* The BTree that contains the table */
1.6221 + Pgno pgno, /* Page number to clear */
1.6222 + MemPage *pParent, /* Parent page. NULL for the root. Not read by this function. */
1.6223 + int freePageFlag /* Deallocate page if true */
1.6224 +){
1.6225 + MemPage *pPage = 0;
1.6226 + int rc;
1.6227 + unsigned char *pCell;
1.6228 + int i;
1.6229 +
1.6230 + assert( sqlite3_mutex_held(pBt->mutex) );
1.6231 + if( pgno>pagerPagecount(pBt->pPager) ){
1.6232 + return SQLITE_CORRUPT_BKPT;
1.6233 + }
1.6234 +
1.6235 + rc = getAndInitPage(pBt, pgno, &pPage);
1.6236 + if( rc ) goto cleardatabasepage_out;
1.6237 + for(i=0; i<pPage->nCell; i++){
1.6238 + pCell = findCell(pPage, i);
1.6239 + if( !pPage->leaf ){
1.6240 + rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1); /* Child page number is the first 4 bytes of the cell; children are always freed */
1.6241 + if( rc ) goto cleardatabasepage_out;
1.6242 + }
1.6243 + rc = clearCell(pPage, pCell);
1.6244 + if( rc ) goto cleardatabasepage_out;
1.6245 + }
1.6246 + if( !pPage->leaf ){
1.6247 + rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1); /* One more child page number is stored at byte offset 8 of the page data */
1.6248 + if( rc ) goto cleardatabasepage_out;
1.6249 + }
1.6250 + if( freePageFlag ){
1.6251 + rc = freePage(pPage);
1.6252 + }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
1.6253 + zeroPage(pPage, pPage->aData[0] | PTF_LEAF); /* Keep the existing flag byte but mark the emptied page as a leaf */
1.6254 + }
1.6255 +
1.6256 +cleardatabasepage_out:
1.6257 + releasePage(pPage); /* Reached on every path after the page-count check; pPage is still 0 if getAndInitPage() left it untouched */
1.6258 + return rc;
1.6259 +}
1.6260 +
1.6261 +/*
1.6262 +** Delete all information from a single table in the database. iTable is
1.6263 +** the page number of the root of the table. After this routine returns,
1.6264 +** the root page is empty, but still exists.
1.6265 +**
1.6266 +** This routine will fail with SQLITE_LOCKED if there are any open
1.6267 +** read cursors on the table. Open write cursors are moved to the
1.6268 +** root of the table. If p has no write transaction, SQLITE_READONLY or SQLITE_ERROR is returned and nothing is changed.
1.6269 +*/
1.6270 +int sqlite3BtreeClearTable(Btree *p, int iTable){
1.6271 + int rc;
1.6272 + BtShared *pBt = p->pBt;
1.6273 + sqlite3BtreeEnter(p);
1.6274 + pBt->db = p->db;
1.6275 + if( p->inTrans!=TRANS_WRITE ){
1.6276 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6277 + }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
1.6278 + /* checkReadLocks() failed: return its code without touching the table */
1.6279 + }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
1.6280 + /* saveAllCursors() failed: return its code without touching the table */
1.6281 + }else{
1.6282 + rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0); /* freePageFlag==0: the root page is emptied but not freed */
1.6283 + }
1.6284 + sqlite3BtreeLeave(p);
1.6285 + return rc;
1.6286 +}
1.6287 +
1.6288 +/*
1.6289 +** Erase all information in a table and add the root of the table to
1.6290 +** the freelist. Except, the root of the principal table (the one on
1.6291 +** page 1) is never added to the freelist.
1.6292 +**
1.6293 +** This routine will fail with SQLITE_LOCKED if there are any open
1.6294 +** cursors on the database (pBt->pCursor is non-NULL).
1.6295 +**
1.6296 +** If AUTOVACUUM is enabled and the page at iTable is not the last
1.6297 +** root page in the database file, then the last root page
1.6298 +** in the database file is moved into the slot formerly occupied by
1.6299 +** iTable and that last slot formerly occupied by the last root page
1.6300 +** is added to the freelist instead of iTable. In this way, all
1.6301 +** root pages are kept at the beginning of the database file, which
1.6302 +** is necessary for AUTOVACUUM to work right. *piMoved is set to the
1.6303 +** page number that used to be the last root page in the file before
1.6304 +** the move. If no page gets moved, *piMoved is set to 0.
1.6305 +** The last root page is recorded in meta[3] and the value of
1.6306 +** meta[3] is updated by this procedure.
1.6307 +*/
1.6308 +static int btreeDropTable(Btree *p, int iTable, int *piMoved){
1.6309 + int rc;
1.6310 + MemPage *pPage = 0;
1.6311 + BtShared *pBt = p->pBt;
1.6312 +
1.6313 + assert( sqlite3BtreeHoldsMutex(p) );
1.6314 + if( p->inTrans!=TRANS_WRITE ){
1.6315 + return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6316 + }
1.6317 +
1.6318 + /* It is illegal to drop a table if any cursors are open on the
1.6319 + ** database. This is because in auto-vacuum mode the backend may
1.6320 + ** need to move another root-page to fill a gap left by the deleted
1.6321 + ** root page. If an open cursor was using this page a problem would
1.6322 + ** occur.
1.6323 + */
1.6324 + if( pBt->pCursor ){
1.6325 + return SQLITE_LOCKED;
1.6326 + }
1.6327 +
1.6328 + rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
1.6329 + if( rc ) return rc;
1.6330 + rc = sqlite3BtreeClearTable(p, iTable); /* Empties the table but leaves its root page allocated */
1.6331 + if( rc ){
1.6332 + releasePage(pPage);
1.6333 + return rc;
1.6334 + }
1.6335 +
1.6336 + *piMoved = 0; /* Only written once the table has been cleared; earlier error returns leave it untouched */
1.6337 +
1.6338 + if( iTable>1 ){
1.6339 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6340 + rc = freePage(pPage);
1.6341 + releasePage(pPage);
1.6342 +#else
1.6343 + if( pBt->autoVacuum ){
1.6344 + Pgno maxRootPgno;
1.6345 + rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
1.6346 + if( rc!=SQLITE_OK ){
1.6347 + releasePage(pPage);
1.6348 + return rc;
1.6349 + }
1.6350 +
1.6351 + if( iTable==maxRootPgno ){
1.6352 + /* If the table being dropped is the table with the largest root-page
1.6353 + ** number in the database, put the root page on the free list.
1.6354 + */
1.6355 + rc = freePage(pPage);
1.6356 + releasePage(pPage);
1.6357 + if( rc!=SQLITE_OK ){
1.6358 + return rc;
1.6359 + }
1.6360 + }else{
1.6361 + /* The table being dropped does not have the largest root-page
1.6362 + ** number in the database. So move the page that does into the
1.6363 + ** gap left by the deleted root-page.
1.6364 + */
1.6365 + MemPage *pMove;
1.6366 + releasePage(pPage);
1.6367 + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
1.6368 + if( rc!=SQLITE_OK ){
1.6369 + return rc;
1.6370 + }
1.6371 + rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
1.6372 + releasePage(pMove);
1.6373 + if( rc!=SQLITE_OK ){
1.6374 + return rc;
1.6375 + }
1.6376 + rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0); /* Fetch page maxRootPgno again after the relocation so that it can be freed */
1.6377 + if( rc!=SQLITE_OK ){
1.6378 + return rc;
1.6379 + }
1.6380 + rc = freePage(pMove);
1.6381 + releasePage(pMove);
1.6382 + if( rc!=SQLITE_OK ){
1.6383 + return rc;
1.6384 + }
1.6385 + *piMoved = maxRootPgno;
1.6386 + }
1.6387 +
1.6388 + /* Set the new 'max-root-page' value in the database header. This
1.6389 + ** is the old value less one, less one more if that happens to
1.6390 + ** be the PENDING_BYTE_PAGE, less one again if that is a
1.6391 + ** pointer-map page.
1.6392 + */
1.6393 + maxRootPgno--;
1.6394 + if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
1.6395 + maxRootPgno--;
1.6396 + }
1.6397 + if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
1.6398 + maxRootPgno--;
1.6399 + }
1.6400 + assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
1.6401 +
1.6402 + rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
1.6403 + }else{
1.6404 + rc = freePage(pPage);
1.6405 + releasePage(pPage);
1.6406 + }
1.6407 +#endif
1.6408 + }else{
1.6409 + /* If sqlite3BtreeDropTable was called on page 1. */
1.6410 + zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
1.6411 + releasePage(pPage);
1.6412 + }
1.6413 + return rc;
1.6414 +}
1.6415 +int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ /* Non-static entry point: runs btreeDropTable() between sqlite3BtreeEnter() and sqlite3BtreeLeave() */
1.6416 + int rc;
1.6417 + sqlite3BtreeEnter(p);
1.6418 + p->pBt->db = p->db; /* Copy this handle's db pointer onto p->pBt before doing the work */
1.6419 + rc = btreeDropTable(p, iTable, piMoved);
1.6420 + sqlite3BtreeLeave(p);
1.6421 + return rc; /* Result of btreeDropTable(), passed through unchanged */
1.6422 +}
1.6423 +
1.6424 +
1.6425 +/*
1.6426 +** Read the meta-information out of a database file. Meta[0]
1.6427 +** is the number of free pages currently in the database. Meta[1]
1.6428 +** through meta[15] are available for use by higher layers. Meta[0]
1.6429 +** is read-only, the others are read/write.
1.6430 +**
1.6431 +** The schema layer numbers meta values differently. At the schema
1.6432 +** layer (and the SetCookie and ReadCookie opcodes) the number of
1.6433 +** free pages is not visible. So Cookie[0] is the same as Meta[1]. The value is the 4-byte integer at offset 36+idx*4 of page 1, stored in *pMeta.
1.6434 +*/
1.6435 +int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
1.6436 + DbPage *pDbPage;
1.6437 + int rc;
1.6438 + unsigned char *pP1;
1.6439 + BtShared *pBt = p->pBt;
1.6440 +
1.6441 + sqlite3BtreeEnter(p);
1.6442 + pBt->db = p->db;
1.6443 +
1.6444 + /* Reading a meta-data value requires a read-lock on page 1 (and hence
1.6445 + ** the sqlite_master table). We grab this lock regardless of whether or
1.6446 + ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
1.6447 + ** 1 is treated as a special case by queryTableLock() and lockTable()).
1.6448 + */
1.6449 + rc = queryTableLock(p, 1, READ_LOCK);
1.6450 + if( rc!=SQLITE_OK ){
1.6451 + sqlite3BtreeLeave(p);
1.6452 + return rc;
1.6453 + }
1.6454 +
1.6455 + assert( idx>=0 && idx<=15 );
1.6456 + if( pBt->pPage1 ){
1.6457 + /* The b-tree is already holding a reference to page 1 of the database
1.6458 + ** file. In this case the required meta-data value can be read directly
1.6459 + ** from the page data of this reference. This is slightly faster than
1.6460 + ** requesting a new reference from the pager layer.
1.6461 + */
1.6462 + pP1 = (unsigned char *)pBt->pPage1->aData;
1.6463 + }else{
1.6464 + /* The b-tree does not have a reference to page 1 of the database file.
1.6465 + ** Obtain one from the pager layer.
1.6466 + */
1.6467 + rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
1.6468 + if( rc ){
1.6469 + sqlite3BtreeLeave(p);
1.6470 + return rc;
1.6471 + }
1.6472 + pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
1.6473 + }
1.6474 + *pMeta = get4byte(&pP1[36 + idx*4]);
1.6475 +
1.6476 + /* If the b-tree is not holding a reference to page 1, then one was
1.6477 + ** requested from the pager layer in the above block. Release it now.
1.6478 + */
1.6479 + if( !pBt->pPage1 ){
1.6480 + sqlite3PagerUnref(pDbPage);
1.6481 + }
1.6482 +
1.6483 + /* If autovacuum is disabled in this build but we are trying to
1.6484 + ** access an autovacuumed database, then make the database readonly.
1.6485 + */
1.6486 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6487 + if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
1.6488 +#endif
1.6489 +
1.6490 + /* Grab the read-lock on page 1. Its result is the return value; *pMeta has already been written by this point. */
1.6491 + rc = lockTable(p, 1, READ_LOCK);
1.6492 + sqlite3BtreeLeave(p);
1.6493 + return rc;
1.6494 +}
1.6495 +
1.6496 +/*
1.6497 +** Write meta-information back into the database. Meta[0] is
1.6498 +** read-only and may not be written. iMeta is stored as 4 bytes at offset 36+idx*4 of page 1. Requires a write transaction on p; otherwise SQLITE_READONLY or SQLITE_ERROR is returned.
1.6499 +*/
1.6500 +int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
1.6501 + BtShared *pBt = p->pBt;
1.6502 + unsigned char *pP1;
1.6503 + int rc;
1.6504 + assert( idx>=1 && idx<=15 );
1.6505 + sqlite3BtreeEnter(p);
1.6506 + pBt->db = p->db;
1.6507 + if( p->inTrans!=TRANS_WRITE ){
1.6508 + rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
1.6509 + }else{
1.6510 + assert( pBt->pPage1!=0 );
1.6511 + pP1 = pBt->pPage1->aData;
1.6512 + rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); /* Page 1 must be made writable before its bytes are modified */
1.6513 + if( rc==SQLITE_OK ){
1.6514 + put4byte(&pP1[36 + idx*4], iMeta);
1.6515 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6516 + if( idx==7 ){ /* Meta slot 7 is mirrored into pBt->incrVacuum */
1.6517 + assert( pBt->autoVacuum || iMeta==0 );
1.6518 + assert( iMeta==0 || iMeta==1 );
1.6519 + pBt->incrVacuum = iMeta;
1.6520 + }
1.6521 +#endif
1.6522 + }
1.6523 + }
1.6524 + sqlite3BtreeLeave(p);
1.6525 + return rc;
1.6526 +}
1.6527 +
1.6528 +/*
1.6529 +** Return the flag byte at the beginning of the page that the cursor
1.6530 +** is currently pointing to, or 0 if the cursor has no current page.
1.6531 +*/
1.6532 +int sqlite3BtreeFlags(BtCursor *pCur){
1.6533 + /* restoreCursorPosition() is invoked first so that a cursor whose
1.6534 + ** position was saved is re-seeked. Its return value is not checked.
1.6535 + */
1.6536 + MemPage *pPage;
1.6537 + restoreCursorPosition(pCur);
1.6538 + pPage = pCur->apPage[pCur->iPage];
1.6539 + assert( cursorHoldsMutex(pCur) );
1.6540 + assert( pPage==0 || pPage->pBt==pCur->pBt ); /* Do not dereference pPage before the NULL test below */
1.6541 + return pPage ? pPage->aData[pPage->hdrOffset] : 0;
1.6542 +}
1.6543 +
1.6544 +
1.6545 +/*
1.6546 +** Return the pager associated with a BTree. This routine is used for
1.6547 +** testing and debugging only. The pager is read from p->pBt->pPager; no mutex is taken here.
1.6548 +*/
1.6549 +Pager *sqlite3BtreePager(Btree *p){
1.6550 + return p->pBt->pPager;
1.6551 +}
1.6552 +
1.6553 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6554 +/*
1.6555 +** Append a message to the error message string pCheck->errMsg: the optional prefix zMsg1 followed by zFormat expanded with the variadic arguments. Messages are separated by a newline. Does nothing once pCheck->mxErr is zero.
1.6556 +*/
1.6557 +static void checkAppendMsg(
1.6558 + IntegrityCk *pCheck,
1.6559 + char *zMsg1,
1.6560 + const char *zFormat,
1.6561 + ...
1.6562 +){
1.6563 + va_list ap;
1.6564 + if( !pCheck->mxErr ) return; /* Error budget exhausted: drop the message */
1.6565 + pCheck->mxErr--;
1.6566 + pCheck->nErr++;
1.6567 + va_start(ap, zFormat);
1.6568 + if( pCheck->errMsg.nChar ){
1.6569 + sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); /* Separator, only when text has already been accumulated */
1.6570 + }
1.6571 + if( zMsg1 ){
1.6572 + sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
1.6573 + }
1.6574 + sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
1.6575 + va_end(ap);
1.6576 + if( pCheck->errMsg.mallocFailed ){
1.6577 + pCheck->mallocFailed = 1; /* Propagate an accumulator allocation failure to the check context */
1.6578 + }
1.6579 +}
1.6580 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6581 +
1.6582 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6583 +/*
1.6584 +** Add 1 to the reference count for page iPage. If this is the second
1.6585 +** reference to the page, add an error message to pCheck->errMsg.
1.6586 +** Return 1 if the page was already referenced, or if iPage is 0 or out of
1.6587 +** range, and 0 if this is the first reference to the page.
1.6588 +**
1.6589 +** Also check that the page number is in bounds (1..pCheck->nPage); an out-of-bounds number is reported.
1.6590 +*/
1.6591 +static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
1.6592 + if( iPage==0 ) return 1; /* Page 0 is rejected without reporting an error */
1.6593 + if( iPage>pCheck->nPage || iPage<0 ){
1.6594 + checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
1.6595 + return 1;
1.6596 + }
1.6597 + if( pCheck->anRef[iPage]==1 ){
1.6598 + checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
1.6599 + return 1; /* The count is left at 1, so every later reference is reported as well */
1.6600 + }
1.6601 + return (pCheck->anRef[iPage]++)>1; /* Post-increment: the test uses the count from before this reference */
1.6602 +}
1.6603 +
1.6604 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6605 +/*
1.6606 +** Check that the entry in the pointer-map for page iChild maps to
1.6607 +** page iParent, pointer type eType. If not, or if the entry cannot be
1.6608 +** read with ptrmapGet(), append an error message to pCheck.
1.6609 +*/
1.6610 +static void checkPtrmap(
1.6611 + IntegrityCk *pCheck, /* Integrity check context */
1.6612 + Pgno iChild, /* Child page number */
1.6613 + u8 eType, /* Expected pointer map type */
1.6614 + Pgno iParent, /* Expected pointer map parent page number */
1.6615 + char *zContext /* Context description (used for error msg) */
1.6616 +){
1.6617 + int rc;
1.6618 + u8 ePtrmapType; /* Type actually stored in the pointer-map */
1.6619 + Pgno iPtrmapParent; /* Parent actually stored in the pointer-map */
1.6620 +
1.6621 + rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
1.6622 + if( rc!=SQLITE_OK ){
1.6623 + checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
1.6624 + return;
1.6625 + }
1.6626 +
1.6627 + if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
1.6628 + checkAppendMsg(pCheck, zContext,
1.6629 + "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
1.6630 + iChild, eType, iParent, ePtrmapType, iPtrmapParent);
1.6631 + }
1.6632 +}
1.6633 +#endif
1.6634 +
1.6635 +/*
1.6636 +** Check the integrity of the freelist or of an overflow page list.
1.6637 +** Verify that the number of pages on the list is at least N: a list that ends early is reported, and the walk stops after N pages or when pCheck->mxErr reaches zero.
1.6638 +*/
1.6639 +static void checkList(
1.6640 + IntegrityCk *pCheck, /* Integrity checking context */
1.6641 + int isFreeList, /* True for a freelist. False for overflow page list */
1.6642 + int iPage, /* Page number for first page in the list */
1.6643 + int N, /* Expected number of pages in the list */
1.6644 + char *zContext /* Context for error messages */
1.6645 +){
1.6646 + int i;
1.6647 + int expected = N; /* Original value of N, kept for the error message */
1.6648 + int iFirst = iPage; /* First page of the list, kept for the error message */
1.6649 + while( N-- > 0 && pCheck->mxErr ){
1.6650 + DbPage *pOvflPage;
1.6651 + unsigned char *pOvflData;
1.6652 + if( iPage<1 ){
1.6653 + checkAppendMsg(pCheck, zContext,
1.6654 + "%d of %d pages missing from overflow list starting at %d",
1.6655 + N+1, expected, iFirst);
1.6656 + break;
1.6657 + }
1.6658 + if( checkRef(pCheck, iPage, zContext) ) break; /* Stop on a page that is out of range or already referenced */
1.6659 + if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
1.6660 + checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
1.6661 + break;
1.6662 + }
1.6663 + pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
1.6664 + if( isFreeList ){
1.6665 + int n = get4byte(&pOvflData[4]); /* Count of page numbers stored on this page, starting at byte 8 */
1.6666 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6667 + if( pCheck->pBt->autoVacuum ){
1.6668 + checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
1.6669 + }
1.6670 +#endif
1.6671 + if( n>pCheck->pBt->usableSize/4-2 ){
1.6672 + checkAppendMsg(pCheck, zContext,
1.6673 + "freelist leaf count too big on page %d", iPage);
1.6674 + N--;
1.6675 + }else{
1.6676 + for(i=0; i<n; i++){
1.6677 + Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
1.6678 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6679 + if( pCheck->pBt->autoVacuum ){
1.6680 + checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
1.6681 + }
1.6682 +#endif
1.6683 + checkRef(pCheck, iFreePage, zContext);
1.6684 + }
1.6685 + N -= n; /* The n pages listed on this page count toward the expected total */
1.6686 + }
1.6687 + }
1.6688 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6689 + else{
1.6690 + /* If this database supports auto-vacuum and iPage is not the last
1.6691 + ** page in this overflow list, check that the pointer-map entry for
1.6692 + ** the following page matches iPage.
1.6693 + */
1.6694 + if( pCheck->pBt->autoVacuum && N>0 ){
1.6695 + i = get4byte(pOvflData);
1.6696 + checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
1.6697 + }
1.6698 + }
1.6699 +#endif
1.6700 + iPage = get4byte(pOvflData); /* The first 4 bytes of the page hold the number of the next page in the list */
1.6701 + sqlite3PagerUnref(pOvflPage);
1.6702 + }
1.6703 +}
1.6704 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6705 +
1.6706 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6707 +/*
1.6708 +** Do various sanity checks on a single page of a tree. Return
1.6709 +** the tree depth. Leaf pages return 1. An interior page returns one
1.6710 +** more than the value returned for its last-checked left child. 0 is returned if iPage is 0, fails checkRef(), or cannot be loaded and initialised.
1.6711 +**
1.6712 +** These checks are done:
1.6713 +**
1.6714 +** 1. Make sure that cells and freeblocks do not overlap
1.6715 +** but combine to completely cover the page.
1.6716 +** NO 2. Make sure cell keys are in order.
1.6717 +** NO 3. Make sure no key is less than or equal to zLowerBound.
1.6718 +** NO 4. Make sure no key is greater than or equal to zUpperBound.
1.6719 +** 5. Check the integrity of overflow pages.
1.6720 +** 6. Recursively call checkTreePage on all children.
1.6721 +** 7. Verify that the depth of all children is the same.
1.6722 +** 8. Make sure this page is at least 33% full or else it is
1.6723 +** the root of the tree.
1.6724 +*/
1.6725 +static int checkTreePage(
1.6726 + IntegrityCk *pCheck, /* Context for the sanity check */
1.6727 + int iPage, /* Page number of the page to check */
1.6728 + MemPage *pParent, /* Parent page. Not read by this function. */
1.6729 + char *zParentContext /* Parent context */
1.6730 +){
1.6731 + MemPage *pPage;
1.6732 + int i, rc, depth, d2, pgno, cnt;
1.6733 + int hdr, cellStart;
1.6734 + int nCell;
1.6735 + u8 *data;
1.6736 + BtShared *pBt;
1.6737 + int usableSize;
1.6738 + char zContext[100];
1.6739 + char *hit = 0;
1.6740 +
1.6741 + sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
1.6742 +
1.6743 + /* Check that the page exists
1.6744 + */
1.6745 + pBt = pCheck->pBt;
1.6746 + usableSize = pBt->usableSize;
1.6747 + if( iPage==0 ) return 0;
1.6748 + if( checkRef(pCheck, iPage, zParentContext) ) return 0;
1.6749 + if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
1.6750 + checkAppendMsg(pCheck, zContext,
1.6751 + "unable to get the page. error code=%d", rc);
1.6752 + return 0;
1.6753 + }
1.6754 + if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
1.6755 + checkAppendMsg(pCheck, zContext,
1.6756 + "sqlite3BtreeInitPage() returns error code %d", rc);
1.6757 + releasePage(pPage);
1.6758 + return 0;
1.6759 + }
1.6760 +
1.6761 + /* Check out all the cells.
1.6762 + */
1.6763 + depth = 0;
1.6764 + for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
1.6765 + u8 *pCell;
1.6766 + int sz;
1.6767 + CellInfo info;
1.6768 +
1.6769 + /* Check payload overflow pages
1.6770 + */
1.6771 + sqlite3_snprintf(sizeof(zContext), zContext,
1.6772 + "On tree page %d cell %d: ", iPage, i);
1.6773 + pCell = findCell(pPage,i);
1.6774 + sqlite3BtreeParseCellPtr(pPage, pCell, &info);
1.6775 + sz = info.nData;
1.6776 + if( !pPage->intKey ) sz += info.nKey;
1.6777 + assert( sz==info.nPayload );
1.6778 + if( sz>info.nLocal ){
1.6779 + int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4); /* Overflow pages needed: each holds usableSize-4 bytes of payload, rounded up */
1.6780 + Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
1.6781 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6782 + if( pBt->autoVacuum ){
1.6783 + checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
1.6784 + }
1.6785 +#endif
1.6786 + checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
1.6787 + }
1.6788 +
1.6789 + /* Check sanity of left child page.
1.6790 + */
1.6791 + if( !pPage->leaf ){
1.6792 + pgno = get4byte(pCell);
1.6793 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6794 + if( pBt->autoVacuum ){
1.6795 + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
1.6796 + }
1.6797 +#endif
1.6798 + d2 = checkTreePage(pCheck,pgno,pPage,zContext);
1.6799 + if( i>0 && d2!=depth ){
1.6800 + checkAppendMsg(pCheck, zContext, "Child page depth differs");
1.6801 + }
1.6802 + depth = d2;
1.6803 + }
1.6804 + }
1.6805 + if( !pPage->leaf ){
1.6806 + pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
1.6807 + sqlite3_snprintf(sizeof(zContext), zContext,
1.6808 + "On page %d at right child: ", iPage);
1.6809 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6810 + if( pBt->autoVacuum ){
1.6811 + checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
1.6812 + }
1.6813 +#endif
1.6814 + checkTreePage(pCheck, pgno, pPage, zContext);
1.6815 + }
1.6816 +
1.6817 + /* Check for complete coverage of the page
1.6818 + */
1.6819 + data = pPage->aData;
1.6820 + hdr = pPage->hdrOffset;
1.6821 + hit = sqlite3PageMalloc( pBt->pageSize );
1.6822 + if( hit==0 ){
1.6823 + pCheck->mallocFailed = 1;
1.6824 + }else{
1.6825 + u16 contentOffset = get2byte(&data[hdr+5]);
1.6826 + if (contentOffset > usableSize) {
1.6827 + checkAppendMsg(pCheck, 0,
1.6828 + "Corruption detected in header on page %d",iPage,0);
1.6829 + goto check_page_abort;
1.6830 + }
1.6831 + memset(hit, 0, usableSize );
1.6832 + memset(hit, 1, get2byte(&data[hdr+5]));
1.6833 + nCell = get2byte(&data[hdr+3]);
1.6834 + cellStart = hdr + 12 - 4*pPage->leaf;
1.6835 + for(i=0; i<nCell; i++){
1.6836 + int pc = get2byte(&data[cellStart+i*2]);
1.6837 + u16 size = 1024;
1.6838 + int j;
1.6839 + if( pc<=usableSize-4 ){ /* Only parse a cell that starts far enough inside the usable area; otherwise the default size triggers the corruption report below */
1.6840 + size = cellSizePtr(pPage, &data[pc]);
1.6841 + }
1.6842 + if( (pc+size-1)>=usableSize || pc<0 ){
1.6843 + checkAppendMsg(pCheck, 0,
1.6844 + "Corruption detected in cell %d on page %d",i,iPage,0);
1.6845 + }else{
1.6846 + for(j=pc+size-1; j>=pc; j--) hit[j]++;
1.6847 + }
1.6848 + }
1.6849 + for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
1.6850 + cnt++){
1.6851 + int size = get2byte(&data[i+2]);
1.6852 + int j;
1.6853 + if( (i+size-1)>=usableSize || i<0 ){
1.6854 + checkAppendMsg(pCheck, 0,
1.6855 + "Corruption detected in cell %d on page %d",i,iPage,0);
1.6856 + }else{
1.6857 + for(j=i+size-1; j>=i; j--) hit[j]++;
1.6858 + }
1.6859 + i = get2byte(&data[i]);
1.6860 + }
1.6861 + for(i=cnt=0; i<usableSize; i++){
1.6862 + if( hit[i]==0 ){
1.6863 + cnt++;
1.6864 + }else if( hit[i]>1 ){
1.6865 + checkAppendMsg(pCheck, 0,
1.6866 + "Multiple uses for byte %d of page %d", i, iPage);
1.6867 + break;
1.6868 + }
1.6869 + }
1.6870 + if( cnt!=data[hdr+7] ){
1.6871 + checkAppendMsg(pCheck, 0,
1.6872 + "Fragmented space is %d byte reported as %d on page %d",
1.6873 + cnt, data[hdr+7], iPage);
1.6874 + }
1.6875 + }
1.6876 +
1.6877 +check_page_abort:
1.6878 + if( hit ) sqlite3PageFree(hit);
1.6879 +
1.6880 + releasePage(pPage);
1.6881 + return depth+1;
1.6882 +}
1.6883 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.6884 +
1.6885 +#ifndef SQLITE_OMIT_INTEGRITY_CHECK
1.6886 +/*
1.6887 +** This routine does a complete check of the given BTree file. aRoot[] is
1.6888 +** an array of page numbers where each page number is the root page of
1.6889 +** a table. nRoot is the number of entries in aRoot.
1.6890 +**
1.6891 +** Write the number of errors seen into *pnErr. Except for some memory
1.6892 +** allocation errors, an error message held in memory obtained from
1.6893 +** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
1.6894 +** returned.
1.6895 +*/
1.6896 +char *sqlite3BtreeIntegrityCheck(
1.6897 + Btree *p, /* The btree to be checked */
1.6898 + int *aRoot, /* An array of root pages numbers for individual trees */
1.6899 + int nRoot, /* Number of entries in aRoot[] */
1.6900 + int mxErr, /* Stop reporting errors after this many */
1.6901 + int *pnErr /* Write number of errors seen to this variable */
1.6902 +){
1.6903 + int i;
1.6904 + int nRef;
1.6905 + IntegrityCk sCheck;
1.6906 + BtShared *pBt = p->pBt;
1.6907 + char zErr[100];
1.6908 +
1.6909 + sqlite3BtreeEnter(p);
1.6910 + pBt->db = p->db;
1.6911 + nRef = sqlite3PagerRefcount(pBt->pPager);
1.6912 + if( lockBtreeWithRetry(p)!=SQLITE_OK ){
1.6913 + *pnErr = 1;
1.6914 + sqlite3BtreeLeave(p);
1.6915 + return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
1.6916 + }
1.6917 + sCheck.pBt = pBt;
1.6918 + sCheck.pPager = pBt->pPager;
1.6919 + sCheck.nPage = pagerPagecount(sCheck.pPager);
1.6920 + sCheck.mxErr = mxErr;
1.6921 + sCheck.nErr = 0;
1.6922 + sCheck.mallocFailed = 0;
1.6923 + *pnErr = 0;
1.6924 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6925 + if( pBt->nTrunc!=0 ){
1.6926 + sCheck.nPage = pBt->nTrunc;
1.6927 + }
1.6928 +#endif
1.6929 + if( sCheck.nPage==0 ){
1.6930 + unlockBtreeIfUnused(pBt);
1.6931 + sqlite3BtreeLeave(p);
1.6932 + return 0;
1.6933 + }
1.6934 + sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
1.6935 + if( !sCheck.anRef ){
1.6936 + unlockBtreeIfUnused(pBt);
1.6937 + *pnErr = 1;
1.6938 + sqlite3BtreeLeave(p);
1.6939 + return 0;
1.6940 + }
1.6941 + for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
1.6942 + i = PENDING_BYTE_PAGE(pBt);
1.6943 + if( i<=sCheck.nPage ){
1.6944 + sCheck.anRef[i] = 1;
1.6945 + }
1.6946 + sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
1.6947 +
1.6948 + /* Check the integrity of the freelist
1.6949 + */
1.6950 + checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
1.6951 + get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
1.6952 +
1.6953 + /* Check all the tables.
1.6954 + */
1.6955 + for(i=0; i<nRoot && sCheck.mxErr; i++){
1.6956 + if( aRoot[i]==0 ) continue;
1.6957 +#ifndef SQLITE_OMIT_AUTOVACUUM
1.6958 + if( pBt->autoVacuum && aRoot[i]>1 ){
1.6959 + checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
1.6960 + }
1.6961 +#endif
1.6962 + checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
1.6963 + }
1.6964 +
1.6965 + /* Make sure every page in the file is referenced
1.6966 + */
1.6967 + for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
1.6968 +#ifdef SQLITE_OMIT_AUTOVACUUM
1.6969 + if( sCheck.anRef[i]==0 ){
1.6970 + checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
1.6971 + }
1.6972 +#else
1.6973 + /* If the database supports auto-vacuum, make sure no tables contain
1.6974 + ** references to pointer-map pages.
1.6975 + */
1.6976 + if( sCheck.anRef[i]==0 &&
1.6977 + (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
1.6978 + checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
1.6979 + }
1.6980 + if( sCheck.anRef[i]!=0 &&
1.6981 + (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
1.6982 + checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
1.6983 + }
1.6984 +#endif
1.6985 + }
1.6986 +
1.6987 + /* Make sure this analysis did not leave any unref() pages
1.6988 + */
1.6989 + unlockBtreeIfUnused(pBt);
1.6990 + if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
1.6991 + checkAppendMsg(&sCheck, 0,
1.6992 + "Outstanding page count goes from %d to %d during this analysis",
1.6993 + nRef, sqlite3PagerRefcount(pBt->pPager)
1.6994 + );
1.6995 + }
1.6996 +
1.6997 + /* Clean up and report errors.
1.6998 + */
1.6999 + sqlite3BtreeLeave(p);
1.7000 + sqlite3_free(sCheck.anRef);
1.7001 + if( sCheck.mallocFailed ){
1.7002 + sqlite3StrAccumReset(&sCheck.errMsg);
1.7003 + *pnErr = sCheck.nErr+1;
1.7004 + return 0;
1.7005 + }
1.7006 + *pnErr = sCheck.nErr;
1.7007 + if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
1.7008 + return sqlite3StrAccumFinish(&sCheck.errMsg);
1.7009 +}
1.7010 +#endif /* SQLITE_OMIT_INTEGRITY_CHECK */
1.7011 +
1.7012 +/*
1.7013 +** Return the full pathname of the underlying database file.
1.7014 +** The value is whatever sqlite3PagerFilename() reports for p's pager.
1.7015 +** The pager filename is invariant as long as the pager is
1.7016 +** open so it is safe to access without the BtShared mutex.
1.7017 +*/
1.7018 +const char *sqlite3BtreeGetFilename(Btree *p){
1.7019 + assert( p->pBt->pPager!=0 ); /* A pager must be attached to the shared btree */
1.7020 + return sqlite3PagerFilename(p->pBt->pPager);
1.7021 +}
1.7022 +
1.7023 +/*
1.7024 +** Return the pathname of the directory that contains the database file.
1.7025 +** The value is whatever sqlite3PagerDirname() reports for p's pager.
1.7026 +** The pager directory name is invariant as long as the pager is
1.7027 +** open so it is safe to access without the BtShared mutex.
1.7028 +*/
1.7029 +const char *sqlite3BtreeGetDirname(Btree *p){
1.7030 + assert( p->pBt->pPager!=0 ); /* A pager must be attached to the shared btree */
1.7031 + return sqlite3PagerDirname(p->pBt->pPager);
1.7032 +}
1.7033 +
1.7034 +/*
1.7035 +** Return the pathname of the journal file for this database. The return
1.7036 +** value of this routine is the same regardless of whether the journal file
1.7037 +** has been created or not. It comes from sqlite3PagerJournalname().
1.7038 +**
1.7039 +** The pager journal filename is invariant as long as the pager is
1.7040 +** open so it is safe to access without the BtShared mutex.
1.7041 +*/
1.7042 +const char *sqlite3BtreeGetJournalname(Btree *p){
1.7043 + assert( p->pBt->pPager!=0 ); /* A pager must be attached to the shared btree */
1.7044 + return sqlite3PagerJournalname(p->pBt->pPager);
1.7045 +}
1.7046 +
1.7047 +#ifndef SQLITE_OMIT_VACUUM
1.7048 +/*
1.7049 +** Copy the complete content of pFrom into pTo. A write transaction
1.7050 +** must be active for both files, otherwise SQLITE_ERROR is returned.
1.7051 +** SQLITE_BUSY is returned if pTo has any open cursors.
1.7052 +** The size of file pTo may be reduced by this operation.
1.7053 +** If the copy fails part-way, the transaction on pTo is rolled back.
1.7054 +**
1.7055 +** If successful, CommitPhaseOne() may be called on pTo before returning.
1.7056 +** The caller should finish committing the transaction on pTo by calling
1.7057 +** sqlite3BtreeCommit().
1.7058 +*/
1.7059 +static int btreeCopyFile(Btree *pTo, Btree *pFrom){
1.7060 + int rc = SQLITE_OK;
1.7061 + Pgno i;
1.7062 +
1.7063 + Pgno nFromPage; /* Number of pages in pFrom */
1.7064 + Pgno nToPage; /* Number of pages in pTo */
1.7065 + Pgno nNewPage; /* Number of pages in pTo after the copy */
1.7066 +
1.7067 + Pgno iSkip; /* Pending byte page in pTo */
1.7068 + int nToPageSize; /* Page size of pTo in bytes */
1.7069 + int nFromPageSize; /* Page size of pFrom in bytes */
1.7070 +
1.7071 + BtShared *pBtTo = pTo->pBt;
1.7072 + BtShared *pBtFrom = pFrom->pBt;
1.7073 + pBtTo->db = pTo->db;
1.7074 + pBtFrom->db = pFrom->db;
1.7075 +
1.7076 + nToPageSize = pBtTo->pageSize;
1.7077 + nFromPageSize = pBtFrom->pageSize;
1.7078 +
1.7079 + if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
1.7080 + return SQLITE_ERROR; /* No rollback on this early exit */
1.7081 + }
1.7082 + if( pBtTo->pCursor ){
1.7083 + return SQLITE_BUSY; /* No rollback on this early exit */
1.7084 + }
1.7085 +
1.7086 + nToPage = pagerPagecount(pBtTo->pPager);
1.7087 + nFromPage = pagerPagecount(pBtFrom->pPager);
1.7088 + iSkip = PENDING_BYTE_PAGE(pBtTo);
1.7089 +
1.7090 + /* Variable nNewPage is the number of pages required to store the
1.7091 + ** contents of pFrom using the current page-size of pTo.
1.7092 + */
1.7093 + nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
1.7094 + (i64)nToPageSize;
1.7095 +
1.7096 + for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
1.7097 +
1.7098 + /* Journal the original page.
1.7099 + **
1.7100 + ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
1.7101 + ** in database *pTo (before the copy). This page is never written
1.7102 + ** into the journal file. Unless i==iSkip or the page was not
1.7103 + ** present in pTo before the copy operation, journal page i from pTo.
1.7104 + */
1.7105 + if( i!=iSkip && i<=nToPage ){
1.7106 + DbPage *pDbPage = 0;
1.7107 + rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
1.7108 + if( rc==SQLITE_OK ){
1.7109 + rc = sqlite3PagerWrite(pDbPage);
1.7110 + if( rc==SQLITE_OK && i>nFromPage ){
1.7111 + /* Yeah. It seems weird to call DontWrite() right after Write(). But
1.7112 + ** that is because the names of those procedures do not exactly
1.7113 + ** represent what they do. Write() really means "put this page in the
1.7114 + ** rollback journal and mark it as dirty so that it will be written
1.7115 + ** to the database file later." DontWrite() undoes the second part of
1.7116 + ** that and prevents the page from being written to the database. The
1.7117 + ** page is still on the rollback journal, though. And that is the
1.7118 + ** whole point of this block: to put pages on the rollback journal.
1.7119 + */
1.7120 + rc = sqlite3PagerDontWrite(pDbPage);
1.7121 + }
1.7122 + sqlite3PagerUnref(pDbPage);
1.7123 + }
1.7124 + }
1.7125 +
1.7126 + /* Overwrite the data in page i of the target database */
1.7127 + if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
1.7128 +
1.7129 + DbPage *pToPage = 0;
1.7130 + sqlite3_int64 iOff;
1.7131 +
1.7132 + rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
1.7133 + if( rc==SQLITE_OK ){
1.7134 + rc = sqlite3PagerWrite(pToPage);
1.7135 + }
1.7136 +
1.7137 + for(
1.7138 + iOff=(i64)(i-1)*(i64)nToPageSize; /* Widen first: byte offsets can exceed 32 bits */
1.7139 + rc==SQLITE_OK && iOff<(i64)i*(i64)nToPageSize;
1.7140 + iOff += nFromPageSize
1.7141 + ){
1.7142 + DbPage *pFromPage = 0;
1.7143 + Pgno iFrom = (iOff/nFromPageSize)+1;
1.7144 +
1.7145 + if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
1.7146 + continue;
1.7147 + }
1.7148 +
1.7149 + rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
1.7150 + if( rc==SQLITE_OK ){
1.7151 + char *zTo = sqlite3PagerGetData(pToPage);
1.7152 + char *zFrom = sqlite3PagerGetData(pFromPage);
1.7153 + int nCopy;
1.7154 +
1.7155 + if( nFromPageSize>=nToPageSize ){
1.7156 + zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
1.7157 + nCopy = nToPageSize;
1.7158 + }else{
1.7159 + zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
1.7160 + nCopy = nFromPageSize;
1.7161 + }
1.7162 +
1.7163 + memcpy(zTo, zFrom, nCopy);
1.7164 + sqlite3PagerUnref(pFromPage);
1.7165 + }
1.7166 + }
1.7167 +
1.7168 + if( pToPage ){
1.7169 + MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
1.7170 + p->isInit = 0; /* Page content was overwritten; clear its initialized flag */
1.7171 + sqlite3PagerUnref(pToPage);
1.7172 + }
1.7173 + }
1.7174 + }
1.7175 +
1.7176 + /* If things have worked so far, the database file may need to be
1.7177 + ** truncated. The complex part is that it may need to be truncated to
1.7178 + ** a size that is not an integer multiple of nToPageSize - the current
1.7179 + ** page size used by the pager associated with B-Tree pTo.
1.7180 + **
1.7181 + ** For example, say the page-size of pTo is 2048 bytes and the original
1.7182 + ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
1.7183 + ** bytes and 9 pages, then the file needs to be truncated to 9KB.
1.7184 + */
1.7185 + if( rc==SQLITE_OK ){
1.7186 + if( nFromPageSize!=nToPageSize ){
1.7187 + sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
1.7188 + i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
1.7189 + i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
1.7190 + i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
1.7191 +
1.7192 + assert( iSize<=iNow );
1.7193 +
1.7194 + /* Commit phase one syncs the journal file associated with pTo
1.7195 + ** containing the original data. It does not sync the database file
1.7196 + ** itself. After doing this it is safe to use OsTruncate() and other
1.7197 + ** file APIs on the database file directly.
1.7198 + */
1.7199 + pBtTo->db = pTo->db;
1.7200 + rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
1.7201 + if( iSize<iNow && rc==SQLITE_OK ){
1.7202 + rc = sqlite3OsTruncate(pFile, iSize);
1.7203 + }
1.7204 +
1.7205 + /* The loop that copied data from database pFrom to pTo did not
1.7206 + ** populate the locking page of database pTo. If the page-size of
1.7207 + ** pFrom is smaller than that of pTo, this means some data will
1.7208 + ** not have been copied.
1.7209 + **
1.7210 + ** This block copies the missing data from database pFrom to pTo
1.7211 + ** using file APIs. This is safe because at this point we know that
1.7212 + ** all of the original data from pTo has been synced into the
1.7213 + ** journal file. At this point it would be safe to do anything at
1.7214 + ** all to the database file except truncate it to zero bytes.
1.7215 + */
1.7216 + if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
1.7217 + i64 iOff;
1.7218 + for(
1.7219 + iOff=iPending;
1.7220 + rc==SQLITE_OK && iOff<(iPending+nToPageSize);
1.7221 + iOff += nFromPageSize
1.7222 + ){
1.7223 + DbPage *pFromPage = 0;
1.7224 + Pgno iFrom = (iOff/nFromPageSize)+1;
1.7225 +
1.7226 + if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
1.7227 + continue;
1.7228 + }
1.7229 +
1.7230 + rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
1.7231 + if( rc==SQLITE_OK ){
1.7232 + char *zFrom = sqlite3PagerGetData(pFromPage);
1.7233 + rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
1.7234 + sqlite3PagerUnref(pFromPage);
1.7235 + }
1.7236 + }
1.7237 + }
1.7238 +
1.7239 + /* Sync the database file */
1.7240 + if( rc==SQLITE_OK ){
1.7241 + rc = sqlite3PagerSync(pBtTo->pPager);
1.7242 + }
1.7243 + }else{
1.7244 + rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
1.7245 + }
1.7246 + if( rc==SQLITE_OK ){
1.7247 + pBtTo->pageSizeFixed = 0;
1.7248 + }
1.7249 + }
1.7250 +
1.7251 + if( rc ){
1.7252 + sqlite3BtreeRollback(pTo); /* Any failure after the precondition checks undoes pTo */
1.7253 + }
1.7254 +
1.7255 + return rc;
1.7256 +}
1.7257 +int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){ /* Public wrapper around btreeCopyFile() */
1.7258 + int rc; /* Result of btreeCopyFile(), returned unchanged */
1.7259 + sqlite3BtreeEnter(pTo); /* Enter pTo first, then pFrom */
1.7260 + sqlite3BtreeEnter(pFrom);
1.7261 + rc = btreeCopyFile(pTo, pFrom);
1.7262 + sqlite3BtreeLeave(pFrom); /* Leave in the reverse order of entry */
1.7263 + sqlite3BtreeLeave(pTo);
1.7264 + return rc;
1.7265 +}
1.7266 +
1.7267 +#endif /* SQLITE_OMIT_VACUUM */
1.7268 +
1.7269 +/*
1.7270 +** Return non-zero if a write transaction is active on p. A NULL p yields zero.
1.7271 +*/
1.7272 +int sqlite3BtreeIsInTrans(Btree *p){
1.7273 + assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); /* Mutex is only checked for a non-NULL handle */
1.7274 + return (p && (p->inTrans==TRANS_WRITE));
1.7275 +}
1.7276 +
1.7277 +/*
1.7278 +** Return non-zero if a statement transaction is active.
1.7279 +*/
1.7280 +int sqlite3BtreeIsInStmt(Btree *p){
1.7281 + assert( sqlite3BtreeHoldsMutex(p) );
1.7282 + return (p->pBt && p->pBt->inStmt); /* Zero when p->pBt is NULL; unlike the sibling routines, p itself is dereferenced unconditionally */
1.7283 +}
1.7284 +
1.7285 +/*
1.7286 +** Return non-zero if a read (or write) transaction is active on p. A NULL p yields zero.
1.7287 +*/
1.7288 +int sqlite3BtreeIsInReadTrans(Btree *p){
1.7289 + assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); /* Do not dereference a NULL handle in the assert */
1.7290 + return (p && (p->inTrans!=TRANS_NONE));
1.7291 +}
1.7292 +
1.7293 +/*
1.7294 +** Fetch the client-owned blob of memory attached to the shared-btree
1.7295 +** behind handle p. Client code uses this memory for its own purposes
1.7296 +** (for example, to hold a high-level schema for the shared-btree);
1.7297 +** the btree layer takes care of the reference counting issues.
1.7298 +**
1.7299 +** On the first call for a shared-btree with a non-zero nBytes, nBytes
1.7300 +** bytes are allocated, zeroed and remembered, and xFree is recorded
1.7301 +** alongside them. Once the blob exists, later calls ignore nBytes and
1.7302 +** xFree and hand back the same pointer.
1.7303 +**
1.7304 +** When nBytes is 0 and nothing has been allocated yet, the return
1.7305 +** value is a null pointer. If the blob already exists it is returned
1.7306 +** as usual.
1.7307 +**
1.7308 +** Just before the shared-btree is closed, the xFree function recorded
1.7309 +** with the allocation is invoked on the blob. That function should
1.7310 +** not call sqlite3_free() on the memory itself; the btree layer does
1.7311 +** that.
1.7312 +*/
1.7313 +void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
1.7314 + BtShared *pShared = p->pBt;
1.7315 + sqlite3BtreeEnter(p);
1.7316 + if( nBytes!=0 && pShared->pSchema==0 ){
1.7317 + pShared->xFreeSchema = xFree;
1.7318 + pShared->pSchema = sqlite3MallocZero(nBytes);
1.7319 + }
1.7320 + sqlite3BtreeLeave(p);
1.7321 + return pShared->pSchema;
1.7322 +}
1.7323 +
1.7324 +/*
1.7325 +** Report whether some other user of the shared btree behind handle p
1.7326 +** currently holds an exclusive lock on the sqlite_master table.
1.7327 +*/
1.7328 +int sqlite3BtreeSchemaLocked(Btree *p){
1.7329 + int isLocked;
1.7330 + assert( sqlite3_mutex_held(p->db->mutex) );
1.7331 + sqlite3BtreeEnter(p);
1.7332 + isLocked = (SQLITE_OK==queryTableLock(p, MASTER_ROOT, READ_LOCK)) ? 0 : 1;
1.7333 + sqlite3BtreeLeave(p);
1.7334 + return isLocked;
1.7335 +}
1.7336 +
1.7337 +
1.7338 +#ifndef SQLITE_OMIT_SHARED_CACHE
1.7339 +/*
1.7340 +** Take a table-level lock on the table rooted at page iTab. A true
1.7341 +** isWriteLock requests a write lock, a false one a read lock.
1.7342 +** Handles that are not sharable take no lock and get SQLITE_OK.
1.7343 +*/
1.7344 +int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
1.7345 + int rc;
1.7346 + u8 eLock;
1.7347 + if( !p->sharable ){
1.7348 + return SQLITE_OK;
1.7349 + }
1.7350 + assert( READ_LOCK+1==WRITE_LOCK );
1.7351 + assert( isWriteLock==0 || isWriteLock==1 );
1.7352 + eLock = READ_LOCK + isWriteLock;
1.7353 + sqlite3BtreeEnter(p);
1.7354 + rc = queryTableLock(p, iTab, eLock);
1.7355 + if( rc==SQLITE_OK ) rc = lockTable(p, iTab, eLock);
1.7356 + sqlite3BtreeLeave(p);
1.7357 + return rc;
1.7358 +}
1.7359 +#endif
1.7360 +
1.7361 +#ifndef SQLITE_OMIT_INCRBLOB
1.7362 +/*
1.7363 +** Argument pCsr must be a cursor opened for writing on an
1.7364 +** INTKEY table currently pointing at a valid table entry.
1.7365 +** This function modifies the data stored as part of that entry.
1.7366 +** Only the data content may be modified; it is not possible
1.7367 +** to change the length of the data stored.
1.7368 +*/
1.7369 +int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
1.7370 + assert( cursorHoldsMutex(pCsr) );
1.7371 + assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
1.7372 + assert(pCsr->isIncrblobHandle);
1.7373 +
1.7374 + restoreCursorPosition(pCsr); /* NOTE(review): any result is not checked here; failure is only observed through eState below - confirm */
1.7375 + assert( pCsr->eState!=CURSOR_REQUIRESEEK );
1.7376 + if( pCsr->eState!=CURSOR_VALID ){
1.7377 + return SQLITE_ABORT; /* Cursor is not on a valid entry after the restore attempt */
1.7378 + }
1.7379 +
1.7380 + /* Check some preconditions:
1.7381 + ** (a) the cursor is open for writing,
1.7382 + ** (b) there is no read-lock on the table being modified and
1.7383 + ** (c) the cursor points at a valid row of an intKey table.
1.7384 + */
1.7385 + if( !pCsr->wrFlag ){
1.7386 + return SQLITE_READONLY;
1.7387 + }
1.7388 + assert( !pCsr->pBt->readOnly
1.7389 + && pCsr->pBt->inTransaction==TRANS_WRITE );
1.7390 + if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
1.7391 + return SQLITE_LOCKED; /* The table pCsr points to has a read lock */
1.7392 + }
1.7393 + if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){ /* NOTE(review): eState was CURSOR_VALID above; the first test looks dead unless checkReadLocks() can change it - confirm */
1.7394 + return SQLITE_ERROR;
1.7395 + }
1.7396 +
1.7397 + return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1); /* Result of accessPayload() is returned unchanged */
1.7398 +}
1.7399 +
1.7400 +/*
1.7401 +** Set a flag on this cursor to cache the locations of pages from the
1.7402 +** overflow list for the current row. This is used by cursors opened
1.7403 +** for incremental blob IO only.
1.7404 +**
1.7405 +** This function sets a flag only. The actual page location cache
1.7406 +** (stored in BtCursor.aOverflow[]) is allocated and used by function
1.7407 +** accessPayload() (the worker function for sqlite3BtreeData() and
1.7408 +** sqlite3BtreePutData()).
1.7409 +*/
1.7410 +void sqlite3BtreeCacheOverflow(BtCursor *pCur){
1.7411 + assert( cursorHoldsMutex(pCur) );
1.7412 + assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
1.7413 + assert(!pCur->isIncrblobHandle); /* The flag must not already be set */
1.7414 + assert(!pCur->aOverflow); /* No overflow cache may exist yet */
1.7415 + pCur->isIncrblobHandle = 1;
1.7416 +}
1.7417 +#endif