Update contrib.
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** $Id: btree.c,v 1.495 2008/08/02 17:36:46 danielk1977 Exp $
14 ** This file implements a external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
21 ** The header string that appears at the beginning of every
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
27 ** Set this global variable to 1 to enable tracing using the TRACE
31 int sqlite3BtreeTrace=0; /* True to enable tracing */
32 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
39 #ifndef SQLITE_OMIT_SHARED_CACHE
41 ** A flag to indicate whether or not shared cache is enabled. Also,
42 ** a list of BtShared objects that are eligible for participation
43 ** in shared cache. The variables have file scope during normal builds,
44 ** but the test harness needs to access these variables so we make them
45 ** global for test builds.
48 BtShared *sqlite3SharedCacheList = 0;
49 int sqlite3SharedCacheEnabled = 0;
51 static BtShared *sqlite3SharedCacheList = 0;
52 static int sqlite3SharedCacheEnabled = 0;
54 #endif /* SQLITE_OMIT_SHARED_CACHE */
56 #ifndef SQLITE_OMIT_SHARED_CACHE
58 ** Enable or disable the shared pager and schema features.
60 ** This routine has no effect on existing database connections.
61 ** The shared cache setting effects only future calls to
62 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
64 int sqlite3_enable_shared_cache(int enable){
65 sqlite3SharedCacheEnabled = enable;
72 ** Forward declaration
74 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
77 #ifdef SQLITE_OMIT_SHARED_CACHE
79 ** The functions queryTableLock(), lockTable() and unlockAllTables()
80 ** manipulate entries in the BtShared.pLock linked list used to store
81 ** shared-cache table level locks. If the library is compiled with the
82 ** shared-cache feature disabled, then there is only ever one user
83 ** of each BtShared structure and so this locking is not necessary.
84 ** So define the lock related functions as no-ops.
86 #define queryTableLock(a,b,c) SQLITE_OK
87 #define lockTable(a,b,c) SQLITE_OK
88 #define unlockAllTables(a)
91 #ifndef SQLITE_OMIT_SHARED_CACHE
93 ** Query to see if btree handle p may obtain a lock of type eLock
94 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
95 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
96 ** SQLITE_LOCKED if not.
98 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
99 BtShared *pBt = p->pBt;
102 assert( sqlite3BtreeHoldsMutex(p) );
103 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
106 /* This is a no-op if the shared-cache is not enabled */
111 /* If some other connection is holding an exclusive lock, the
112 ** requested lock may not be obtained.
114 if( pBt->pExclusive && pBt->pExclusive!=p ){
115 return SQLITE_LOCKED;
118 /* This (along with lockTable()) is where the ReadUncommitted flag is
119 ** dealt with. If the caller is querying for a read-lock and the flag is
120 ** set, it is unconditionally granted - even if there are write-locks
121 ** on the table. If a write-lock is requested, the ReadUncommitted flag
122 ** is not considered.
124 ** In function lockTable(), if a read-lock is demanded and the
125 ** ReadUncommitted flag is set, no entry is added to the locks list
128 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
129 ** not create or respect table locks. The locking procedure for a
130 ** write-cursor does not change.
133 0==(p->db->flags&SQLITE_ReadUncommitted) ||
137 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
138 if( pIter->pBtree!=p && pIter->iTable==iTab &&
139 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
140 return SQLITE_LOCKED;
146 #endif /* !SQLITE_OMIT_SHARED_CACHE */
148 #ifndef SQLITE_OMIT_SHARED_CACHE
150 ** Add a lock on the table with root-page iTable to the shared-btree used
151 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
154 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
155 ** SQLITE_NOMEM may also be returned.
157 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
158 BtShared *pBt = p->pBt;
162 assert( sqlite3BtreeHoldsMutex(p) );
163 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
166 /* This is a no-op if the shared-cache is not enabled */
171 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
173 /* If the read-uncommitted flag is set and a read-lock is requested,
174 ** return early without adding an entry to the BtShared.pLock list. See
175 ** comment in function queryTableLock() for more info on handling
176 ** the ReadUncommitted flag.
179 (p->db->flags&SQLITE_ReadUncommitted) &&
180 (eLock==READ_LOCK) &&
186 /* First search the list for an existing lock on this table. */
187 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
188 if( pIter->iTable==iTable && pIter->pBtree==p ){
194 /* If the above search did not find a BtLock struct associating Btree p
195 ** with table iTable, allocate one and link it into the list.
198 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
202 pLock->iTable = iTable;
204 pLock->pNext = pBt->pLock;
208 /* Set the BtLock.eLock variable to the maximum of the current lock
209 ** and the requested lock. This means if a write-lock was already held
210 ** and a read-lock requested, we don't incorrectly downgrade the lock.
212 assert( WRITE_LOCK>READ_LOCK );
213 if( eLock>pLock->eLock ){
214 pLock->eLock = eLock;
219 #endif /* !SQLITE_OMIT_SHARED_CACHE */
221 #ifndef SQLITE_OMIT_SHARED_CACHE
223 ** Release all the table locks (locks obtained via calls to the lockTable()
224 ** procedure) held by Btree handle p.
226 static void unlockAllTables(Btree *p){
227 BtShared *pBt = p->pBt;
228 BtLock **ppIter = &pBt->pLock;
230 assert( sqlite3BtreeHoldsMutex(p) );
231 assert( p->sharable || 0==*ppIter );
234 BtLock *pLock = *ppIter;
235 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
236 if( pLock->pBtree==p ){
237 *ppIter = pLock->pNext;
240 ppIter = &pLock->pNext;
244 if( pBt->pExclusive==p ){
248 #endif /* SQLITE_OMIT_SHARED_CACHE */
250 static void releasePage(MemPage *pPage); /* Forward reference */
253 ** Verify that the cursor holds a mutex on the BtShared
256 static int cursorHoldsMutex(BtCursor *p){
257 return sqlite3_mutex_held(p->pBt->mutex);
262 #ifndef SQLITE_OMIT_INCRBLOB
264 ** Invalidate the overflow page-list cache for cursor pCur, if any.
266 static void invalidateOverflowCache(BtCursor *pCur){
267 assert( cursorHoldsMutex(pCur) );
268 sqlite3_free(pCur->aOverflow);
273 ** Invalidate the overflow page-list cache for all cursors opened
274 ** on the shared btree structure pBt.
276 static void invalidateAllOverflowCache(BtShared *pBt){
278 assert( sqlite3_mutex_held(pBt->mutex) );
279 for(p=pBt->pCursor; p; p=p->pNext){
280 invalidateOverflowCache(p);
284 #define invalidateOverflowCache(x)
285 #define invalidateAllOverflowCache(x)
289 ** Save the current cursor position in the variables BtCursor.nKey
290 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
292 static int saveCursorPosition(BtCursor *pCur){
295 assert( CURSOR_VALID==pCur->eState );
296 assert( 0==pCur->pKey );
297 assert( cursorHoldsMutex(pCur) );
299 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
301 /* If this is an intKey table, then the above call to BtreeKeySize()
302 ** stores the integer key in pCur->nKey. In this case this value is
303 ** all that is required. Otherwise, if pCur is not open on an intKey
304 ** table, then malloc space for and store the pCur->nKey bytes of key
307 if( rc==SQLITE_OK && 0==pCur->pPage->intKey){
308 void *pKey = sqlite3Malloc(pCur->nKey);
310 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
320 assert( !pCur->pPage->intKey || !pCur->pKey );
323 releasePage(pCur->pPage);
325 pCur->eState = CURSOR_REQUIRESEEK;
328 invalidateOverflowCache(pCur);
333 ** Save the positions of all cursors except pExcept open on the table
334 ** with root-page iRoot. Usually, this is called just before cursor
335 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
337 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
339 assert( sqlite3_mutex_held(pBt->mutex) );
340 assert( pExcept==0 || pExcept->pBt==pBt );
341 for(p=pBt->pCursor; p; p=p->pNext){
342 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
343 p->eState==CURSOR_VALID ){
344 int rc = saveCursorPosition(p);
354 ** Clear the current cursor position.
356 static void clearCursorPosition(BtCursor *pCur){
357 assert( cursorHoldsMutex(pCur) );
358 sqlite3_free(pCur->pKey);
360 pCur->eState = CURSOR_INVALID;
364 ** Restore the cursor to the position it was in (or as close to as possible)
365 ** when saveCursorPosition() was called. Note that this call deletes the
366 ** saved position info stored by saveCursorPosition(), so there can be
367 ** at most one effective restoreCursorPosition() call after each
368 ** saveCursorPosition().
370 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
372 assert( cursorHoldsMutex(pCur) );
373 assert( pCur->eState>=CURSOR_REQUIRESEEK );
374 if( pCur->eState==CURSOR_FAULT ){
377 pCur->eState = CURSOR_INVALID;
378 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, 0, pCur->nKey, 0, &pCur->skip);
380 sqlite3_free(pCur->pKey);
382 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
387 #define restoreCursorPosition(p) \
388 (p->eState>=CURSOR_REQUIRESEEK ? \
389 sqlite3BtreeRestoreCursorPosition(p) : \
393 ** Determine whether or not a cursor has moved from the position it
394 ** was last placed at. Cursor can move when the row they are pointing
395 ** at is deleted out from under them.
397 ** This routine returns an error code if something goes wrong. The
398 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
400 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
403 rc = restoreCursorPosition(pCur);
408 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
416 #ifndef SQLITE_OMIT_AUTOVACUUM
418 ** Given a page number of a regular database page, return the page
419 ** number for the pointer-map page that contains the entry for the
420 ** input page number.
422 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
423 int nPagesPerMapPage, iPtrMap, ret;
424 assert( sqlite3_mutex_held(pBt->mutex) );
425 nPagesPerMapPage = (pBt->usableSize/5)+1;
426 iPtrMap = (pgno-2)/nPagesPerMapPage;
427 ret = (iPtrMap*nPagesPerMapPage) + 2;
428 if( ret==PENDING_BYTE_PAGE(pBt) ){
435 ** Write an entry into the pointer map.
437 ** This routine updates the pointer map entry for page number 'key'
438 ** so that it maps to type 'eType' and parent page number 'pgno'.
439 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
441 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
442 DbPage *pDbPage; /* The pointer map page */
443 u8 *pPtrmap; /* The pointer map data */
444 Pgno iPtrmap; /* The pointer map page number */
445 int offset; /* Offset in pointer map page */
448 assert( sqlite3_mutex_held(pBt->mutex) );
449 /* The master-journal page number must never be used as a pointer map page */
450 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
452 assert( pBt->autoVacuum );
454 return SQLITE_CORRUPT_BKPT;
456 iPtrmap = PTRMAP_PAGENO(pBt, key);
457 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
461 offset = PTRMAP_PTROFFSET(iPtrmap, key);
462 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
464 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
465 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
466 rc = sqlite3PagerWrite(pDbPage);
468 pPtrmap[offset] = eType;
469 put4byte(&pPtrmap[offset+1], parent);
473 sqlite3PagerUnref(pDbPage);
478 ** Read an entry from the pointer map.
480 ** This routine retrieves the pointer map entry for page 'key', writing
481 ** the type and parent page number to *pEType and *pPgno respectively.
482 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
484 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
485 DbPage *pDbPage; /* The pointer map page */
486 int iPtrmap; /* Pointer map page index */
487 u8 *pPtrmap; /* Pointer map page data */
488 int offset; /* Offset of entry in pointer map */
491 assert( sqlite3_mutex_held(pBt->mutex) );
493 iPtrmap = PTRMAP_PAGENO(pBt, key);
494 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
498 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
500 offset = PTRMAP_PTROFFSET(iPtrmap, key);
502 *pEType = pPtrmap[offset];
503 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
505 sqlite3PagerUnref(pDbPage);
506 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
510 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
511 #define ptrmapPut(w,x,y,z) SQLITE_OK
512 #define ptrmapGet(w,x,y,z) SQLITE_OK
513 #define ptrmapPutOvfl(y,z) SQLITE_OK
517 ** Given a btree page and a cell index (0 means the first cell on
518 ** the page, 1 means the second cell, and so forth) return a pointer
519 ** to the cell content.
521 ** This routine works only for pages that do not contain overflow cells.
523 #define findCell(P,I) \
524 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
527 ** This a more complex version of findCell() that works for
528 ** pages that do contain overflow cells. See insert
530 static u8 *findOverflowCell(MemPage *pPage, int iCell){
532 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
533 for(i=pPage->nOverflow-1; i>=0; i--){
535 struct _OvflCell *pOvfl;
536 pOvfl = &pPage->aOvfl[i];
545 return findCell(pPage, iCell);
549 ** Parse a cell content block and fill in the CellInfo structure. There
550 ** are two versions of this function. sqlite3BtreeParseCell() takes a
551 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
552 ** takes a pointer to the body of the cell as its second argument.
554 ** Within this file, the parseCell() macro can be called instead of
555 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
557 void sqlite3BtreeParseCellPtr(
558 MemPage *pPage, /* Page containing the cell */
559 u8 *pCell, /* Pointer to the cell text. */
560 CellInfo *pInfo /* Fill in this structure */
562 int n; /* Number bytes in cell content header */
563 u32 nPayload; /* Number of bytes of cell payload */
565 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
567 pInfo->pCell = pCell;
568 assert( pPage->leaf==0 || pPage->leaf==1 );
569 n = pPage->childPtrSize;
570 assert( n==4-4*pPage->leaf );
572 if( pPage->hasData ){
573 n += getVarint32(&pCell[n], nPayload);
577 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
578 pInfo->nData = nPayload;
581 n += getVarint32(&pCell[n], nPayload);
582 pInfo->nKey = nPayload;
584 pInfo->nPayload = nPayload;
586 if( likely(nPayload<=pPage->maxLocal) ){
587 /* This is the (easy) common case where the entire payload fits
588 ** on the local page. No overflow is required.
590 int nSize; /* Total size of cell content in bytes */
591 nSize = nPayload + n;
592 pInfo->nLocal = nPayload;
593 pInfo->iOverflow = 0;
594 if( (nSize & ~3)==0 ){
595 nSize = 4; /* Minimum cell size is 4 */
597 pInfo->nSize = nSize;
599 /* If the payload will not fit completely on the local page, we have
600 ** to decide how much to store locally and how much to spill onto
601 ** overflow pages. The strategy is to minimize the amount of unused
602 ** space on overflow pages while keeping the amount of local storage
603 ** in between minLocal and maxLocal.
605 ** Warning: changing the way overflow payload is distributed in any
606 ** way will result in an incompatible file format.
608 int minLocal; /* Minimum amount of payload held locally */
609 int maxLocal; /* Maximum amount of payload held locally */
610 int surplus; /* Overflow payload available for local storage */
612 minLocal = pPage->minLocal;
613 maxLocal = pPage->maxLocal;
614 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
615 if( surplus <= maxLocal ){
616 pInfo->nLocal = surplus;
618 pInfo->nLocal = minLocal;
620 pInfo->iOverflow = pInfo->nLocal + n;
621 pInfo->nSize = pInfo->iOverflow + 4;
624 #define parseCell(pPage, iCell, pInfo) \
625 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
626 void sqlite3BtreeParseCell(
627 MemPage *pPage, /* Page containing the cell */
628 int iCell, /* The cell index. First cell is 0 */
629 CellInfo *pInfo /* Fill in this structure */
631 parseCell(pPage, iCell, pInfo);
635 ** Compute the total number of bytes that a Cell needs in the cell
636 ** data area of the btree-page. The return number includes the cell
637 ** data header and the local payload, but not any overflow page or
638 ** the space used by the cell pointer.
641 static u16 cellSize(MemPage *pPage, int iCell){
643 sqlite3BtreeParseCell(pPage, iCell, &info);
647 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
649 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
653 #ifndef SQLITE_OMIT_AUTOVACUUM
655 ** If the cell pCell, part of page pPage contains a pointer
656 ** to an overflow page, insert an entry into the pointer-map
657 ** for the overflow page.
659 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
662 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
663 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
664 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
665 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
666 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
671 ** If the cell with index iCell on page pPage contains a pointer
672 ** to an overflow page, insert an entry into the pointer-map
673 ** for the overflow page.
675 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
677 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
678 pCell = findOverflowCell(pPage, iCell);
679 return ptrmapPutOvflPtr(pPage, pCell);
685 ** Defragment the page given. All Cells are moved to the
686 ** end of the page and all free space is collected into one
687 ** big FreeBlk that occurs in between the header and cell
688 ** pointer array and the cell content area.
690 static void defragmentPage(MemPage *pPage){
691 int i; /* Loop counter */
692 int pc; /* Address of a i-th cell */
693 int addr; /* Offset of first byte after cell pointer array */
694 int hdr; /* Offset to the page header */
695 int size; /* Size of a cell */
696 int usableSize; /* Number of usable bytes on a page */
697 int cellOffset; /* Offset to the cell pointer array */
698 int brk; /* Offset to the cell content area */
699 int nCell; /* Number of cells on the page */
700 unsigned char *data; /* The page data */
701 unsigned char *temp; /* Temp area for cell content */
703 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
704 assert( pPage->pBt!=0 );
705 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
706 assert( pPage->nOverflow==0 );
707 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
708 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
710 hdr = pPage->hdrOffset;
711 cellOffset = pPage->cellOffset;
712 nCell = pPage->nCell;
713 assert( nCell==get2byte(&data[hdr+3]) );
714 usableSize = pPage->pBt->usableSize;
715 brk = get2byte(&data[hdr+5]);
716 memcpy(&temp[brk], &data[brk], usableSize - brk);
718 for(i=0; i<nCell; i++){
719 u8 *pAddr; /* The i-th cell pointer */
720 pAddr = &data[cellOffset + i*2];
721 pc = get2byte(pAddr);
722 assert( pc<pPage->pBt->usableSize );
723 size = cellSizePtr(pPage, &temp[pc]);
725 memcpy(&data[brk], &temp[pc], size);
726 put2byte(pAddr, brk);
728 assert( brk>=cellOffset+2*nCell );
729 put2byte(&data[hdr+5], brk);
733 addr = cellOffset+2*nCell;
734 memset(&data[addr], 0, brk-addr);
738 ** Allocate nByte bytes of space on a page.
740 ** Return the index into pPage->aData[] of the first byte of
741 ** the new allocation. The caller guarantees that there is enough
742 ** space. This routine will never fail.
744 ** If the page contains nBytes of free space but does not contain
745 ** nBytes of contiguous free space, then this routine automatically
746 ** calls defragementPage() to consolidate all free space before
747 ** allocating the new chunk.
749 static int allocateSpace(MemPage *pPage, int nByte){
759 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
760 assert( pPage->pBt );
761 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
762 assert( nByte>=0 ); /* Minimum cell size is 4 */
763 assert( pPage->nFree>=nByte );
764 assert( pPage->nOverflow==0 );
765 pPage->nFree -= nByte;
766 hdr = pPage->hdrOffset;
770 /* Search the freelist looking for a slot big enough to satisfy the
773 while( (pc = get2byte(&data[addr]))>0 ){
774 size = get2byte(&data[pc+2]);
777 memcpy(&data[addr], &data[pc], 2);
778 data[hdr+7] = nFrag + size - nByte;
781 put2byte(&data[pc+2], size-nByte);
782 return pc + size - nByte;
789 /* Allocate memory from the gap in between the cell pointer array
790 ** and the cell content area.
792 top = get2byte(&data[hdr+5]);
793 nCell = get2byte(&data[hdr+3]);
794 cellOffset = pPage->cellOffset;
795 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
796 defragmentPage(pPage);
797 top = get2byte(&data[hdr+5]);
800 assert( cellOffset + 2*nCell <= top );
801 put2byte(&data[hdr+5], top);
806 ** Return a section of the pPage->aData to the freelist.
807 ** The first byte of the new free block is pPage->aDisk[start]
808 ** and the size of the block is "size" bytes.
810 ** Most of the effort here is involved in coalesing adjacent
811 ** free blocks into a single big free block.
813 static void freeSpace(MemPage *pPage, int start, int size){
814 int addr, pbegin, hdr;
815 unsigned char *data = pPage->aData;
817 assert( pPage->pBt!=0 );
818 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
819 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
820 assert( (start + size)<=pPage->pBt->usableSize );
821 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
822 assert( size>=0 ); /* Minimum cell size is 4 */
824 #ifdef SQLITE_SECURE_DELETE
825 /* Overwrite deleted information with zeros when the SECURE_DELETE
826 ** option is enabled at compile-time */
827 memset(&data[start], 0, size);
830 /* Add the space back into the linked list of freeblocks */
831 hdr = pPage->hdrOffset;
833 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
834 assert( pbegin<=pPage->pBt->usableSize-4 );
835 assert( pbegin>addr );
838 assert( pbegin<=pPage->pBt->usableSize-4 );
839 assert( pbegin>addr || pbegin==0 );
840 put2byte(&data[addr], start);
841 put2byte(&data[start], pbegin);
842 put2byte(&data[start+2], size);
843 pPage->nFree += size;
845 /* Coalesce adjacent free blocks */
846 addr = pPage->hdrOffset + 1;
847 while( (pbegin = get2byte(&data[addr]))>0 ){
849 assert( pbegin>addr );
850 assert( pbegin<=pPage->pBt->usableSize-4 );
851 pnext = get2byte(&data[pbegin]);
852 psize = get2byte(&data[pbegin+2]);
853 if( pbegin + psize + 3 >= pnext && pnext>0 ){
854 int frag = pnext - (pbegin+psize);
855 assert( frag<=data[pPage->hdrOffset+7] );
856 data[pPage->hdrOffset+7] -= frag;
857 put2byte(&data[pbegin], get2byte(&data[pnext]));
858 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
864 /* If the cell content area begins with a freeblock, remove it. */
865 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
867 pbegin = get2byte(&data[hdr+1]);
868 memcpy(&data[hdr+1], &data[pbegin], 2);
869 top = get2byte(&data[hdr+5]);
870 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
875 ** Decode the flags byte (the first byte of the header) for a page
876 ** and initialize fields of the MemPage structure accordingly.
878 ** Only the following combinations are supported. Anything different
879 ** indicates a corrupt database files:
882 ** PTF_ZERODATA | PTF_LEAF
883 ** PTF_LEAFDATA | PTF_INTKEY
884 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
886 static int decodeFlags(MemPage *pPage, int flagByte){
887 BtShared *pBt; /* A copy of pPage->pBt */
889 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
890 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
891 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
892 flagByte &= ~PTF_LEAF;
893 pPage->childPtrSize = 4-4*pPage->leaf;
895 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
897 pPage->hasData = pPage->leaf;
898 pPage->maxLocal = pBt->maxLeaf;
899 pPage->minLocal = pBt->minLeaf;
900 }else if( flagByte==PTF_ZERODATA ){
903 pPage->maxLocal = pBt->maxLocal;
904 pPage->minLocal = pBt->minLocal;
906 return SQLITE_CORRUPT_BKPT;
912 ** Initialize the auxiliary information for a disk block.
914 ** The pParent parameter must be a pointer to the MemPage which
915 ** is the parent of the page being initialized. The root of a
916 ** BTree has no parent and so for that page, pParent==NULL.
918 ** Return SQLITE_OK on success. If we see that the page does
919 ** not contain a well-formed database page, then return
920 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
921 ** guarantee that the page is well-formed. It only shows that
922 ** we failed to detect any corruption.
924 int sqlite3BtreeInitPage(
925 MemPage *pPage, /* The page to be initialized */
926 MemPage *pParent /* The parent. Might be NULL */
928 int pc; /* Address of a freeblock within pPage->aData[] */
929 int hdr; /* Offset to beginning of page header */
930 u8 *data; /* Equal to pPage->aData */
931 BtShared *pBt; /* The main btree structure */
932 int usableSize; /* Amount of usable space on each page */
933 int cellOffset; /* Offset from start of page to first cell pointer */
934 int nFree; /* Number of unused bytes on the page */
935 int top; /* First byte of the cell content area */
939 assert( pParent==0 || pParent->pBt==pBt );
940 assert( sqlite3_mutex_held(pBt->mutex) );
941 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
942 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
943 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
944 if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
945 /* The parent page should never change unless the file is corrupt */
946 return SQLITE_CORRUPT_BKPT;
948 if( pPage->isInit ) return SQLITE_OK;
949 if( pPage->pParent==0 && pParent!=0 ){
950 pPage->pParent = pParent;
951 sqlite3PagerRef(pParent->pDbPage);
953 hdr = pPage->hdrOffset;
955 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
956 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
957 pPage->maskPage = pBt->pageSize - 1;
958 pPage->nOverflow = 0;
960 usableSize = pBt->usableSize;
961 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
962 top = get2byte(&data[hdr+5]);
963 pPage->nCell = get2byte(&data[hdr+3]);
964 if( pPage->nCell>MX_CELL(pBt) ){
965 /* To many cells for a single page. The page must be corrupt */
966 return SQLITE_CORRUPT_BKPT;
968 if( pPage->nCell==0 && pParent!=0 && pParent->pgno!=1 ){
969 /* All pages must have at least one cell, except for root pages */
970 return SQLITE_CORRUPT_BKPT;
973 /* Compute the total free space on the page */
974 pc = get2byte(&data[hdr+1]);
975 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
978 if( pc>usableSize-4 ){
979 /* Free block is off the page */
980 return SQLITE_CORRUPT_BKPT;
982 next = get2byte(&data[pc]);
983 size = get2byte(&data[pc+2]);
984 if( next>0 && next<=pc+size+3 ){
985 /* Free blocks must be in accending order */
986 return SQLITE_CORRUPT_BKPT;
991 pPage->nFree = nFree;
992 if( nFree>=usableSize ){
993 /* Free space cannot exceed total page size */
994 return SQLITE_CORRUPT_BKPT;
998 /* Check that all the offsets in the cell offset array are within range.
1000 ** Omitting this consistency check and using the pPage->maskPage mask
1001 ** to prevent overrunning the page buffer in findCell() results in a
1002 ** 2.5% performance gain.
1005 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1006 u8 *pEnd; /* Pointer to end of cell offset array */
1007 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1008 mask = ~(((u8)(pBt->pageSize>>8))-1);
1009 pEnd = &data[cellOffset + pPage->nCell*2];
1010 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1012 return SQLITE_CORRUPT_BKPT;
1022 ** Set up a raw page so that it looks like a database page holding
1025 static void zeroPage(MemPage *pPage, int flags){
1026 unsigned char *data = pPage->aData;
1027 BtShared *pBt = pPage->pBt;
1028 int hdr = pPage->hdrOffset;
1031 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1032 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1033 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1034 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1035 assert( sqlite3_mutex_held(pBt->mutex) );
1036 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1038 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1039 memset(&data[hdr+1], 0, 4);
1041 put2byte(&data[hdr+5], pBt->usableSize);
1042 pPage->nFree = pBt->usableSize - first;
1043 decodeFlags(pPage, flags);
1044 pPage->hdrOffset = hdr;
1045 pPage->cellOffset = first;
1046 pPage->nOverflow = 0;
1047 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1048 pPage->maskPage = pBt->pageSize - 1;
1049 pPage->idxShift = 0;
1055 ** Get a page from the pager. Initialize the MemPage.pBt and
1056 ** MemPage.aData elements if needed.
1058 ** If the noContent flag is set, it means that we do not care about
1059 ** the content of the page at this time. So do not go to the disk
1060 ** to fetch the content. Just fill in the content with zeros for now.
1061 ** If in the future we call sqlite3PagerWrite() on this page, that
1062 ** means we have started to be concerned about content and the disk
1063 ** read should occur at that point.
1065 int sqlite3BtreeGetPage(
1066 BtShared *pBt, /* The btree */
1067 Pgno pgno, /* Number of the page to fetch */
1068 MemPage **ppPage, /* Return the page in this parameter */
1069 int noContent /* Do not load page content if true */
1075 assert( sqlite3_mutex_held(pBt->mutex) );
1076 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1078 pPage = (MemPage *)sqlite3PagerGetExtra(pDbPage);
1079 pPage->aData = sqlite3PagerGetData(pDbPage);
1080 pPage->pDbPage = pDbPage;
1083 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1089 ** Get a page from the pager and initialize it. This routine
1090 ** is just a convenience wrapper around separate calls to
1091 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1093 static int getAndInitPage(
1094 BtShared *pBt, /* The database file */
1095 Pgno pgno, /* Number of the page to get */
1096 MemPage **ppPage, /* Write the page pointer here */
1097 MemPage *pParent /* Parent of the page */
1100 assert( sqlite3_mutex_held(pBt->mutex) );
1102 return SQLITE_CORRUPT_BKPT;
1104 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1105 if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
1106 rc = sqlite3BtreeInitPage(*ppPage, pParent);
1107 if( rc!=SQLITE_OK ){
1108 releasePage(*ppPage);
1116 ** Release a MemPage. This should be called once for each prior
1117 ** call to sqlite3BtreeGetPage.
1119 static void releasePage(MemPage *pPage){
1121 assert( pPage->aData );
1122 assert( pPage->pBt );
1123 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1124 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1125 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1126 sqlite3PagerUnref(pPage->pDbPage);
1131 ** This routine is called when the reference count for a page
1132 ** reaches zero. We need to unref the pParent pointer when that
1135 static void pageDestructor(DbPage *pData, int pageSize){
1137 assert( (pageSize & 7)==0 );
1138 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1139 assert( pPage->isInit==0 || sqlite3_mutex_held(pPage->pBt->mutex) );
1140 if( pPage->pParent ){
1141 MemPage *pParent = pPage->pParent;
1142 assert( pParent->pBt==pPage->pBt );
1144 releasePage(pParent);
1150 ** During a rollback, when the pager reloads information into the cache
1151 ** so that the cache is restored to its original state at the start of
1152 ** the transaction, for each page restored this routine is called.
1154 ** This routine needs to reset the extra data section at the end of the
1155 ** page to agree with the restored data.
1157 static void pageReinit(DbPage *pData, int pageSize){
1159 assert( (pageSize & 7)==0 );
1160 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1161 if( pPage->isInit ){
1162 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1164 sqlite3BtreeInitPage(pPage, pPage->pParent);
1169 ** Invoke the busy handler for a btree.
1171 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1172 BtShared *pBt = (BtShared*)pArg;
1174 assert( sqlite3_mutex_held(pBt->db->mutex) );
1175 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1179 ** Open a database file.
1181 ** zFilename is the name of the database file. If zFilename is NULL
1182 ** a new database with a random name is created. This randomly named
1183 ** database file will be deleted when sqlite3BtreeClose() is called.
1184 ** If zFilename is ":memory:" then an in-memory database is created
1185 ** that is automatically destroyed when it is closed.
1187 int sqlite3BtreeOpen(
1188 const char *zFilename, /* Name of the file containing the BTree database */
1189 sqlite3 *db, /* Associated database handle */
1190 Btree **ppBtree, /* Pointer to new Btree object written here */
1191 int flags, /* Options */
1192 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1194 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1195 BtShared *pBt = 0; /* Shared part of btree structure */
1196 Btree *p; /* Handle to return */
1199 unsigned char zDbHeader[100];
1201 /* Set the variable isMemdb to true for an in-memory database, or
1202 ** false for a file-based database. This symbol is only required if
1203 ** either of the shared-data or autovacuum features are compiled
1204 ** into the library.
1206 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1207 #ifdef SQLITE_OMIT_MEMORYDB
1208 const int isMemdb = 0;
1210 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1215 assert( sqlite3_mutex_held(db->mutex) );
1218 p = sqlite3MallocZero(sizeof(Btree));
1220 return SQLITE_NOMEM;
1222 p->inTrans = TRANS_NONE;
1225 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1227 ** If this Btree is a candidate for shared cache, try to find an
1228 ** existing BtShared object that we can share with
1231 && (db->flags & SQLITE_Vtab)==0
1232 && zFilename && zFilename[0]
1234 if( sqlite3SharedCacheEnabled ){
1235 int nFullPathname = pVfs->mxPathname+1;
1236 char *zFullPathname = sqlite3Malloc(nFullPathname);
1237 sqlite3_mutex *mutexShared;
1239 db->flags |= SQLITE_SharedCache;
1240 if( !zFullPathname ){
1242 return SQLITE_NOMEM;
1244 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1245 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1246 sqlite3_mutex_enter(mutexShared);
1247 for(pBt=sqlite3SharedCacheList; pBt; pBt=pBt->pNext){
1248 assert( pBt->nRef>0 );
1249 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1250 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1256 sqlite3_mutex_leave(mutexShared);
1257 sqlite3_free(zFullPathname);
1261 /* In debug mode, we mark all persistent databases as sharable
1262 ** even when they are not. This exercises the locking code and
1263 ** gives more opportunity for asserts(sqlite3_mutex_held())
1264 ** statements to find locking problems.
1273 ** The following asserts make sure that structures used by the btree are
1274 ** the right size. This is to guard against size changes that result
1275 ** when compiling on a different architecture.
1277 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1278 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1279 assert( sizeof(u32)==4 );
1280 assert( sizeof(u16)==2 );
1281 assert( sizeof(Pgno)==4 );
1283 pBt = sqlite3MallocZero( sizeof(*pBt) );
1286 goto btree_open_out;
1288 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1289 pBt->busyHdr.pArg = pBt;
1290 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1291 EXTRA_SIZE, flags, vfsFlags);
1292 if( rc==SQLITE_OK ){
1293 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1295 if( rc!=SQLITE_OK ){
1296 goto btree_open_out;
1298 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1301 sqlite3PagerSetDestructor(pBt->pPager, pageDestructor);
1302 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1305 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1306 pBt->pageSize = get2byte(&zDbHeader[16]);
1307 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1308 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1310 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1311 #ifndef SQLITE_OMIT_AUTOVACUUM
1312 /* If the magic name ":memory:" will create an in-memory database, then
1313 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1314 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1315 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1316 ** regular file-name. In this case the auto-vacuum applies as per normal.
1318 if( zFilename && !isMemdb ){
1319 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1320 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1325 nReserve = zDbHeader[20];
1326 pBt->pageSizeFixed = 1;
1327 #ifndef SQLITE_OMIT_AUTOVACUUM
1328 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1329 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1332 pBt->usableSize = pBt->pageSize - nReserve;
1333 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1334 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1336 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1337 /* Add the new BtShared object to the linked list sharable BtShareds.
1340 sqlite3_mutex *mutexShared;
1342 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1343 if( SQLITE_THREADSAFE && sqlite3Config.bCoreMutex ){
1344 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1345 if( pBt->mutex==0 ){
1347 db->mallocFailed = 0;
1348 goto btree_open_out;
1351 sqlite3_mutex_enter(mutexShared);
1352 pBt->pNext = sqlite3SharedCacheList;
1353 sqlite3SharedCacheList = pBt;
1354 sqlite3_mutex_leave(mutexShared);
1359 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1360 /* If the new Btree uses a sharable pBtShared, then link the new
1361 ** Btree into the list of all sharable Btrees for the same connection.
1362 ** The list is kept in ascending order by pBt address.
1367 for(i=0; i<db->nDb; i++){
1368 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1369 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1370 if( p->pBt<pSib->pBt ){
1375 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1378 p->pNext = pSib->pNext;
1381 p->pNext->pPrev = p;
1393 if( rc!=SQLITE_OK ){
1394 if( pBt && pBt->pPager ){
1395 sqlite3PagerClose(pBt->pPager);
1405 ** Decrement the BtShared.nRef counter. When it reaches zero,
1406 ** remove the BtShared structure from the sharing list. Return
1407 ** true if the BtShared.nRef counter reaches zero and return
1408 ** false if it is still positive.
1410 static int removeFromSharingList(BtShared *pBt){
1411 #ifndef SQLITE_OMIT_SHARED_CACHE
1412 sqlite3_mutex *pMaster;
1416 assert( sqlite3_mutex_notheld(pBt->mutex) );
1417 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1418 sqlite3_mutex_enter(pMaster);
1421 if( sqlite3SharedCacheList==pBt ){
1422 sqlite3SharedCacheList = pBt->pNext;
1424 pList = sqlite3SharedCacheList;
1425 while( ALWAYS(pList) && pList->pNext!=pBt ){
1428 if( ALWAYS(pList) ){
1429 pList->pNext = pBt->pNext;
1432 if( SQLITE_THREADSAFE ){
1433 sqlite3_mutex_free(pBt->mutex);
1437 sqlite3_mutex_leave(pMaster);
1445 ** Make sure pBt->pTmpSpace points to an allocation of
1446 ** MX_CELL_SIZE(pBt) bytes.
1448 static void allocateTempSpace(BtShared *pBt){
1449 if( !pBt->pTmpSpace ){
1450 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1455 ** Free the pBt->pTmpSpace allocation
1457 static void freeTempSpace(BtShared *pBt){
1458 sqlite3PageFree( pBt->pTmpSpace);
1463 ** Close an open database and invalidate all cursors.
1465 int sqlite3BtreeClose(Btree *p){
1466 BtShared *pBt = p->pBt;
1469 /* Close all cursors opened via this handle. */
1470 assert( sqlite3_mutex_held(p->db->mutex) );
1471 sqlite3BtreeEnter(p);
1473 pCur = pBt->pCursor;
1475 BtCursor *pTmp = pCur;
1477 if( pTmp->pBtree==p ){
1478 sqlite3BtreeCloseCursor(pTmp);
1482 /* Rollback any active transaction and free the handle structure.
1483 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1486 sqlite3BtreeRollback(p);
1487 sqlite3BtreeLeave(p);
1489 /* If there are still other outstanding references to the shared-btree
1490 ** structure, return now. The remainder of this procedure cleans
1491 ** up the shared-btree.
1493 assert( p->wantToLock==0 && p->locked==0 );
1494 if( !p->sharable || removeFromSharingList(pBt) ){
1495 /* The pBt is no longer on the sharing list, so we can access
1496 ** it without having to hold the mutex.
1498 ** Clean out and delete the BtShared object.
1500 assert( !pBt->pCursor );
1501 sqlite3PagerClose(pBt->pPager);
1502 if( pBt->xFreeSchema && pBt->pSchema ){
1503 pBt->xFreeSchema(pBt->pSchema);
1505 sqlite3_free(pBt->pSchema);
1510 #ifndef SQLITE_OMIT_SHARED_CACHE
1511 assert( p->wantToLock==0 );
1512 assert( p->locked==0 );
1513 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1514 if( p->pNext ) p->pNext->pPrev = p->pPrev;
1522 ** Change the limit on the number of pages allowed in the cache.
1524 ** The maximum number of cache pages is set to the absolute
1525 ** value of mxPage. If mxPage is negative, the pager will
1526 ** operate asynchronously - it will not stop to do fsync()s
1527 ** to insure data is written to the disk surface before
1528 ** continuing. Transactions still work if synchronous is off,
1529 ** and the database cannot be corrupted if this program
1530 ** crashes. But if the operating system crashes or there is
1531 ** an abrupt power failure when synchronous is off, the database
1532 ** could be left in an inconsistent and unrecoverable state.
1533 ** Synchronous is on by default so database corruption is not
1534 ** normally a worry.
1536 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1537 BtShared *pBt = p->pBt;
1538 assert( sqlite3_mutex_held(p->db->mutex) );
1539 sqlite3BtreeEnter(p);
1540 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1541 sqlite3BtreeLeave(p);
1546 ** Change the way data is synced to disk in order to increase or decrease
1547 ** how well the database resists damage due to OS crashes and power
1548 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
1549 ** there is a high probability of damage) Level 2 is the default. There
1550 ** is a very low but non-zero probability of damage. Level 3 reduces the
1551 ** probability of damage to near zero but with a write performance reduction.
1553 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1554 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1555 BtShared *pBt = p->pBt;
1556 assert( sqlite3_mutex_held(p->db->mutex) );
1557 sqlite3BtreeEnter(p);
1558 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1559 sqlite3BtreeLeave(p);
1565 ** Return TRUE if the given btree is set to safety level 1. In other
1566 ** words, return TRUE if no sync() occurs on the disk files.
1568 int sqlite3BtreeSyncDisabled(Btree *p){
1569 BtShared *pBt = p->pBt;
1571 assert( sqlite3_mutex_held(p->db->mutex) );
1572 sqlite3BtreeEnter(p);
1573 assert( pBt && pBt->pPager );
1574 rc = sqlite3PagerNosync(pBt->pPager);
1575 sqlite3BtreeLeave(p);
1579 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1581 ** Change the default pages size and the number of reserved bytes per page.
1583 ** The page size must be a power of 2 between 512 and 65536. If the page
1584 ** size supplied does not meet this constraint then the page size is not
1587 ** Page sizes are constrained to be a power of two so that the region
1588 ** of the database file used for locking (beginning at PENDING_BYTE,
1589 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1590 ** at the beginning of a page.
1592 ** If parameter nReserve is less than zero, then the number of reserved
1593 ** bytes per page is left unchanged.
1595 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1597 BtShared *pBt = p->pBt;
1598 sqlite3BtreeEnter(p);
1599 if( pBt->pageSizeFixed ){
1600 sqlite3BtreeLeave(p);
1601 return SQLITE_READONLY;
1604 nReserve = pBt->pageSize - pBt->usableSize;
1606 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1607 ((pageSize-1)&pageSize)==0 ){
1608 assert( (pageSize & 7)==0 );
1609 assert( !pBt->pPage1 && !pBt->pCursor );
1610 pBt->pageSize = pageSize;
1612 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1614 pBt->usableSize = pBt->pageSize - nReserve;
1615 sqlite3BtreeLeave(p);
1620 ** Return the currently defined page size
1622 int sqlite3BtreeGetPageSize(Btree *p){
1623 return p->pBt->pageSize;
1625 int sqlite3BtreeGetReserve(Btree *p){
1627 sqlite3BtreeEnter(p);
1628 n = p->pBt->pageSize - p->pBt->usableSize;
1629 sqlite3BtreeLeave(p);
1634 ** Set the maximum page count for a database if mxPage is positive.
1635 ** No changes are made if mxPage is 0 or negative.
1636 ** Regardless of the value of mxPage, return the maximum page count.
1638 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1640 sqlite3BtreeEnter(p);
1641 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1642 sqlite3BtreeLeave(p);
1645 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1648 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1649 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1650 ** is disabled. The default value for the auto-vacuum property is
1651 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1653 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1654 #ifdef SQLITE_OMIT_AUTOVACUUM
1655 return SQLITE_READONLY;
1657 BtShared *pBt = p->pBt;
1659 int av = (autoVacuum?1:0);
1661 sqlite3BtreeEnter(p);
1662 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1663 rc = SQLITE_READONLY;
1665 pBt->autoVacuum = av;
1667 sqlite3BtreeLeave(p);
1673 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1674 ** enabled 1 is returned. Otherwise 0.
1676 int sqlite3BtreeGetAutoVacuum(Btree *p){
1677 #ifdef SQLITE_OMIT_AUTOVACUUM
1678 return BTREE_AUTOVACUUM_NONE;
1681 sqlite3BtreeEnter(p);
1683 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1684 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1685 BTREE_AUTOVACUUM_INCR
1687 sqlite3BtreeLeave(p);
1694 ** Get a reference to pPage1 of the database file. This will
1695 ** also acquire a readlock on that file.
1697 ** SQLITE_OK is returned on success. If the file is not a
1698 ** well-formed database file, then SQLITE_CORRUPT is returned.
1699 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1700 ** is returned if we run out of memory.
1702 static int lockBtree(BtShared *pBt){
1707 assert( sqlite3_mutex_held(pBt->mutex) );
1708 if( pBt->pPage1 ) return SQLITE_OK;
1709 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1710 if( rc!=SQLITE_OK ) return rc;
1712 /* Do some checking to help insure the file we opened really is
1713 ** a valid database file.
1715 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1716 if( rc!=SQLITE_OK ){
1717 goto page1_init_failed;
1718 }else if( nPage>0 ){
1721 u8 *page1 = pPage1->aData;
1723 if( memcmp(page1, zMagicHeader, 16)!=0 ){
1724 goto page1_init_failed;
1730 goto page1_init_failed;
1733 /* The maximum embedded fraction must be exactly 25%. And the minimum
1734 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1735 ** The original design allowed these amounts to vary, but as of
1736 ** version 3.6.0, we require them to be fixed.
1738 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1739 goto page1_init_failed;
1741 pageSize = get2byte(&page1[16]);
1742 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1743 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1745 goto page1_init_failed;
1747 assert( (pageSize & 7)==0 );
1748 usableSize = pageSize - page1[20];
1749 if( pageSize!=pBt->pageSize ){
1750 /* After reading the first page of the database assuming a page size
1751 ** of BtShared.pageSize, we have discovered that the page-size is
1752 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1753 ** zero and return SQLITE_OK. The caller will call this function
1754 ** again with the correct page-size.
1756 releasePage(pPage1);
1757 pBt->usableSize = usableSize;
1758 pBt->pageSize = pageSize;
1760 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1763 if( usableSize<500 ){
1764 goto page1_init_failed;
1766 pBt->pageSize = pageSize;
1767 pBt->usableSize = usableSize;
1768 #ifndef SQLITE_OMIT_AUTOVACUUM
1769 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1770 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1774 /* maxLocal is the maximum amount of payload to store locally for
1775 ** a cell. Make sure it is small enough so that at least minFanout
1776 ** cells can will fit on one page. We assume a 10-byte page header.
1777 ** Besides the payload, the cell must store:
1778 ** 2-byte pointer to the cell
1779 ** 4-byte child pointer
1780 ** 9-byte nKey value
1781 ** 4-byte nData value
1782 ** 4-byte overflow page pointer
1783 ** So a cell consists of a 2-byte poiner, a header which is as much as
1784 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1787 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1788 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
1789 pBt->maxLeaf = pBt->usableSize - 35;
1790 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
1791 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1792 pBt->pPage1 = pPage1;
1796 releasePage(pPage1);
1802 ** This routine works like lockBtree() except that it also invokes the
1803 ** busy callback if there is lock contention.
1805 static int lockBtreeWithRetry(Btree *pRef){
1808 assert( sqlite3BtreeHoldsMutex(pRef) );
1809 if( pRef->inTrans==TRANS_NONE ){
1810 u8 inTransaction = pRef->pBt->inTransaction;
1811 btreeIntegrity(pRef);
1812 rc = sqlite3BtreeBeginTrans(pRef, 0);
1813 pRef->pBt->inTransaction = inTransaction;
1814 pRef->inTrans = TRANS_NONE;
1815 if( rc==SQLITE_OK ){
1816 pRef->pBt->nTransaction--;
1818 btreeIntegrity(pRef);
1825 ** If there are no outstanding cursors and we are not in the middle
1826 ** of a transaction but there is a read lock on the database, then
1827 ** this routine unrefs the first page of the database file which
1828 ** has the effect of releasing the read lock.
1830 ** If there are any outstanding cursors, this routine is a no-op.
1832 ** If there is a transaction in progress, this routine is a no-op.
1834 static void unlockBtreeIfUnused(BtShared *pBt){
1835 assert( sqlite3_mutex_held(pBt->mutex) );
1836 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1837 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1838 assert( pBt->pPage1->aData );
1840 if( pBt->pPage1->aData==0 ){
1841 MemPage *pPage = pBt->pPage1;
1842 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1847 releasePage(pBt->pPage1);
1855 ** Create a new database by initializing the first page of the
1858 static int newDatabase(BtShared *pBt){
1860 unsigned char *data;
1864 assert( sqlite3_mutex_held(pBt->mutex) );
1865 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1866 if( rc!=SQLITE_OK || nPage>0 ){
1872 rc = sqlite3PagerWrite(pP1->pDbPage);
1874 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1875 assert( sizeof(zMagicHeader)==16 );
1876 put2byte(&data[16], pBt->pageSize);
1879 data[20] = pBt->pageSize - pBt->usableSize;
1883 memset(&data[24], 0, 100-24);
1884 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1885 pBt->pageSizeFixed = 1;
1886 #ifndef SQLITE_OMIT_AUTOVACUUM
1887 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1888 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1889 put4byte(&data[36 + 4*4], pBt->autoVacuum);
1890 put4byte(&data[36 + 7*4], pBt->incrVacuum);
1896 ** Attempt to start a new transaction. A write-transaction
1897 ** is started if the second argument is nonzero, otherwise a read-
1898 ** transaction. If the second argument is 2 or more and exclusive
1899 ** transaction is started, meaning that no other process is allowed
1900 ** to access the database. A preexisting transaction may not be
1901 ** upgraded to exclusive by calling this routine a second time - the
1902 ** exclusivity flag only works for a new transaction.
1904 ** A write-transaction must be started before attempting any
1905 ** changes to the database. None of the following routines
1906 ** will work unless a transaction is started first:
1908 ** sqlite3BtreeCreateTable()
1909 ** sqlite3BtreeCreateIndex()
1910 ** sqlite3BtreeClearTable()
1911 ** sqlite3BtreeDropTable()
1912 ** sqlite3BtreeInsert()
1913 ** sqlite3BtreeDelete()
1914 ** sqlite3BtreeUpdateMeta()
1916 ** If an initial attempt to acquire the lock fails because of lock contention
1917 ** and the database was previously unlocked, then invoke the busy handler
1918 ** if there is one. But if there was previously a read-lock, do not
1919 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1920 ** returned when there is already a read-lock in order to avoid a deadlock.
1922 ** Suppose there are two processes A and B. A has a read lock and B has
1923 ** a reserved lock. B tries to promote to exclusive but is blocked because
1924 ** of A's read lock. A tries to promote to reserved but is blocked by B.
1925 ** One or the other of the two processes must give way or there can be
1926 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1927 ** when A already has a read lock, we encourage A to give up and let B
1930 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1931 BtShared *pBt = p->pBt;
1934 sqlite3BtreeEnter(p);
1938 /* If the btree is already in a write-transaction, or it
1939 ** is already in a read-transaction and a read-transaction
1940 ** is requested, this is a no-op.
1942 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1946 /* Write transactions are not possible on a read-only database */
1947 if( pBt->readOnly && wrflag ){
1948 rc = SQLITE_READONLY;
1952 /* If another database handle has already opened a write transaction
1953 ** on this shared-btree structure and a second write transaction is
1954 ** requested, return SQLITE_BUSY.
1956 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1961 #ifndef SQLITE_OMIT_SHARED_CACHE
1964 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1965 if( pIter->pBtree!=p ){
1974 if( pBt->pPage1==0 ){
1976 rc = lockBtree(pBt);
1977 }while( pBt->pPage1==0 && rc==SQLITE_OK );
1980 if( rc==SQLITE_OK && wrflag ){
1981 if( pBt->readOnly ){
1982 rc = SQLITE_READONLY;
1984 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
1985 if( rc==SQLITE_OK ){
1986 rc = newDatabase(pBt);
1991 if( rc==SQLITE_OK ){
1992 if( wrflag ) pBt->inStmt = 0;
1994 unlockBtreeIfUnused(pBt);
1996 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
1997 sqlite3BtreeInvokeBusyHandler(pBt, 0) );
1999 if( rc==SQLITE_OK ){
2000 if( p->inTrans==TRANS_NONE ){
2001 pBt->nTransaction++;
2003 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2004 if( p->inTrans>pBt->inTransaction ){
2005 pBt->inTransaction = p->inTrans;
2007 #ifndef SQLITE_OMIT_SHARED_CACHE
2009 assert( !pBt->pExclusive );
2010 pBt->pExclusive = p;
2018 sqlite3BtreeLeave(p);
2023 ** Return the size of the database file in pages. Or return -1 if
2024 ** there is any kind of error.
2026 static int pagerPagecount(Pager *pPager){
2029 rc = sqlite3PagerPagecount(pPager, &nPage);
2030 return (rc==SQLITE_OK?nPage:-1);
2034 #ifndef SQLITE_OMIT_AUTOVACUUM
2037 ** Set the pointer-map entries for all children of page pPage. Also, if
2038 ** pPage contains cells that point to overflow pages, set the pointer
2039 ** map entries for the overflow pages as well.
2041 static int setChildPtrmaps(MemPage *pPage){
2042 int i; /* Counter variable */
2043 int nCell; /* Number of cells in page pPage */
2044 int rc; /* Return code */
2045 BtShared *pBt = pPage->pBt;
2046 int isInitOrig = pPage->isInit;
2047 Pgno pgno = pPage->pgno;
2049 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2050 rc = sqlite3BtreeInitPage(pPage, pPage->pParent);
2051 if( rc!=SQLITE_OK ){
2052 goto set_child_ptrmaps_out;
2054 nCell = pPage->nCell;
2056 for(i=0; i<nCell; i++){
2057 u8 *pCell = findCell(pPage, i);
2059 rc = ptrmapPutOvflPtr(pPage, pCell);
2060 if( rc!=SQLITE_OK ){
2061 goto set_child_ptrmaps_out;
2065 Pgno childPgno = get4byte(pCell);
2066 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2067 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
2072 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2073 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2076 set_child_ptrmaps_out:
2077 pPage->isInit = isInitOrig;
2082 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2083 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
2084 ** iTo. Parameter eType describes the type of pointer to be modified, as
2087 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2090 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2091 ** page pointed to by one of the cells on pPage.
2093 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2094 ** overflow page in the list.
2096 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2097 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2098 if( eType==PTRMAP_OVERFLOW2 ){
2099 /* The pointer is always the first 4 bytes of the page in this case. */
2100 if( get4byte(pPage->aData)!=iFrom ){
2101 return SQLITE_CORRUPT_BKPT;
2103 put4byte(pPage->aData, iTo);
2105 int isInitOrig = pPage->isInit;
2109 sqlite3BtreeInitPage(pPage, 0);
2110 nCell = pPage->nCell;
2112 for(i=0; i<nCell; i++){
2113 u8 *pCell = findCell(pPage, i);
2114 if( eType==PTRMAP_OVERFLOW1 ){
2116 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
2117 if( info.iOverflow ){
2118 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2119 put4byte(&pCell[info.iOverflow], iTo);
2124 if( get4byte(pCell)==iFrom ){
2125 put4byte(pCell, iTo);
2132 if( eType!=PTRMAP_BTREE ||
2133 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2134 return SQLITE_CORRUPT_BKPT;
2136 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2139 pPage->isInit = isInitOrig;
2146 ** Move the open database page pDbPage to location iFreePage in the
2147 ** database. The pDbPage reference remains valid.
2149 static int relocatePage(
2150 BtShared *pBt, /* Btree */
2151 MemPage *pDbPage, /* Open page to move */
2152 u8 eType, /* Pointer map 'type' entry for pDbPage */
2153 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2154 Pgno iFreePage, /* The location to move pDbPage to */
2157 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2158 Pgno iDbPage = pDbPage->pgno;
2159 Pager *pPager = pBt->pPager;
2162 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2163 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2164 assert( sqlite3_mutex_held(pBt->mutex) );
2165 assert( pDbPage->pBt==pBt );
2167 /* Move page iDbPage from its current location to page number iFreePage */
2168 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2169 iDbPage, iFreePage, iPtrPage, eType));
2170 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2171 if( rc!=SQLITE_OK ){
2174 pDbPage->pgno = iFreePage;
2176 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2177 ** that point to overflow pages. The pointer map entries for all these
2178 ** pages need to be changed.
2180 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2181 ** pointer to a subsequent overflow page. If this is the case, then
2182 ** the pointer map needs to be updated for the subsequent overflow page.
2184 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2185 rc = setChildPtrmaps(pDbPage);
2186 if( rc!=SQLITE_OK ){
2190 Pgno nextOvfl = get4byte(pDbPage->aData);
2192 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2193 if( rc!=SQLITE_OK ){
2199 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2200 ** that it points at iFreePage. Also fix the pointer map entry for
2203 if( eType!=PTRMAP_ROOTPAGE ){
2204 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2205 if( rc!=SQLITE_OK ){
2208 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2209 if( rc!=SQLITE_OK ){
2210 releasePage(pPtrPage);
2213 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2214 releasePage(pPtrPage);
2215 if( rc==SQLITE_OK ){
2216 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2222 /* Forward declaration required by incrVacuumStep(). */
2223 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2226 ** Perform a single step of an incremental-vacuum. If successful,
2227 ** return SQLITE_OK. If there is no work to do (and therefore no
2228 ** point in calling this function again), return SQLITE_DONE.
2230 ** More specificly, this function attempts to re-organize the
2231 ** database so that the last page of the file currently in use
2232 ** is no longer in use.
2234 ** If the nFin parameter is non-zero, the implementation assumes
2235 ** that the caller will keep calling incrVacuumStep() until
2236 ** it returns SQLITE_DONE or an error, and that nFin is the
2237 ** number of pages the database file will contain after this
2238 ** process is complete.
2240 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2241 Pgno iLastPg; /* Last page in the database */
2242 Pgno nFreeList; /* Number of pages still on the free-list */
2244 assert( sqlite3_mutex_held(pBt->mutex) );
2245 iLastPg = pBt->nTrunc;
2247 iLastPg = pagerPagecount(pBt->pPager);
2250 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2255 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2256 if( nFreeList==0 || nFin==iLastPg ){
2260 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2261 if( rc!=SQLITE_OK ){
2264 if( eType==PTRMAP_ROOTPAGE ){
2265 return SQLITE_CORRUPT_BKPT;
2268 if( eType==PTRMAP_FREEPAGE ){
2270 /* Remove the page from the files free-list. This is not required
2271 ** if nFin is non-zero. In that case, the free-list will be
2272 ** truncated to zero after this function returns, so it doesn't
2273 ** matter if it still contains some garbage entries.
2277 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2278 if( rc!=SQLITE_OK ){
2281 assert( iFreePg==iLastPg );
2282 releasePage(pFreePg);
2285 Pgno iFreePg; /* Index of free page to move pLastPg to */
2288 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2289 if( rc!=SQLITE_OK ){
2293 /* If nFin is zero, this loop runs exactly once and page pLastPg
2294 ** is swapped with the first free page pulled off the free list.
2296 ** On the other hand, if nFin is greater than zero, then keep
2297 ** looping until a free-page located within the first nFin pages
2298 ** of the file is found.
2302 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2303 if( rc!=SQLITE_OK ){
2304 releasePage(pLastPg);
2307 releasePage(pFreePg);
2308 }while( nFin!=0 && iFreePg>nFin );
2309 assert( iFreePg<iLastPg );
2311 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2312 if( rc==SQLITE_OK ){
2313 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2315 releasePage(pLastPg);
2316 if( rc!=SQLITE_OK ){
2322 pBt->nTrunc = iLastPg - 1;
2323 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2330 ** A write-transaction must be opened before calling this function.
2331 ** It performs a single unit of work towards an incremental vacuum.
2333 ** If the incremental vacuum is finished after this function has run,
2334 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
2335 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2337 int sqlite3BtreeIncrVacuum(Btree *p){
2339 BtShared *pBt = p->pBt;
2341 sqlite3BtreeEnter(p);
2343 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2344 if( !pBt->autoVacuum ){
2347 invalidateAllOverflowCache(pBt);
2348 rc = incrVacuumStep(pBt, 0);
2350 sqlite3BtreeLeave(p);
2355 ** This routine is called prior to sqlite3PagerCommit when a transaction
2356 ** is commited for an auto-vacuum database.
2358 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2359 ** the database file should be truncated to during the commit process.
2360 ** i.e. the database has been reorganized so that only the first *pnTrunc
2361 ** pages are in use.
2363 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2365 Pager *pPager = pBt->pPager;
2367 int nRef = sqlite3PagerRefcount(pPager);
2370 assert( sqlite3_mutex_held(pBt->mutex) );
2371 invalidateAllOverflowCache(pBt);
2372 assert(pBt->autoVacuum);
2373 if( !pBt->incrVacuum ){
2376 if( pBt->nTrunc==0 ){
2379 const int pgsz = pBt->pageSize;
2380 int nOrig = pagerPagecount(pBt->pPager);
2382 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2383 return SQLITE_CORRUPT_BKPT;
2385 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2388 nFree = get4byte(&pBt->pPage1->aData[36]);
2389 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2390 nFin = nOrig - nFree - nPtrmap;
2391 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2394 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2399 while( rc==SQLITE_OK ){
2400 rc = incrVacuumStep(pBt, nFin);
2402 if( rc==SQLITE_DONE ){
2403 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2405 if( pBt->nTrunc && nFin ){
2406 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2407 put4byte(&pBt->pPage1->aData[32], 0);
2408 put4byte(&pBt->pPage1->aData[36], 0);
2412 if( rc!=SQLITE_OK ){
2413 sqlite3PagerRollback(pPager);
2417 if( rc==SQLITE_OK ){
2418 *pnTrunc = pBt->nTrunc;
2421 assert( nRef==sqlite3PagerRefcount(pPager) );
2428 ** This routine does the first phase of a two-phase commit. This routine
2429 ** causes a rollback journal to be created (if it does not already exist)
2430 ** and populated with enough information so that if a power loss occurs
2431 ** the database can be restored to its original state by playing back
2432 ** the journal. Then the contents of the journal are flushed out to
2433 ** the disk. After the journal is safely on oxide, the changes to the
2434 ** database are written into the database file and flushed to oxide.
2435 ** At the end of this call, the rollback journal still exists on the
2436 ** disk and we are still holding all locks, so the transaction has not
2437 ** committed. See sqlite3BtreeCommit() for the second phase of the
2440 ** This call is a no-op if no write-transaction is currently active on pBt.
2442 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2443 ** the name of a master journal file that should be written into the
2444 ** individual journal file, or is NULL, indicating no master journal file
2445 ** (single database transaction).
2447 ** When this is called, the master journal should already have been
2448 ** created, populated with this journal pointer and synced to disk.
2450 ** Once this is routine has returned, the only thing required to commit
2451 ** the write-transaction for this database file is to delete the journal.
2453 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2455 if( p->inTrans==TRANS_WRITE ){
2456 BtShared *pBt = p->pBt;
2458 sqlite3BtreeEnter(p);
2460 #ifndef SQLITE_OMIT_AUTOVACUUM
2461 if( pBt->autoVacuum ){
2462 rc = autoVacuumCommit(pBt, &nTrunc);
2463 if( rc!=SQLITE_OK ){
2464 sqlite3BtreeLeave(p);
2469 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
2470 sqlite3BtreeLeave(p);
2476 ** Commit the transaction currently in progress.
2478 ** This routine implements the second phase of a 2-phase commit. The
2479 ** sqlite3BtreeSync() routine does the first phase and should be invoked
2480 ** prior to calling this routine. The sqlite3BtreeSync() routine did
2481 ** all the work of writing information out to disk and flushing the
2482 ** contents so that they are written onto the disk platter. All this
2483 ** routine has to do is delete or truncate the rollback journal
2484 ** (which causes the transaction to commit) and drop locks.
2486 ** This will release the write lock on the database file. If there
2487 ** are no active cursors, it also releases the read lock.
2489 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2490 BtShared *pBt = p->pBt;
2492 sqlite3BtreeEnter(p);
2496 /* If the handle has a write-transaction open, commit the shared-btrees
2497 ** transaction and set the shared state to TRANS_READ.
2499 if( p->inTrans==TRANS_WRITE ){
2501 assert( pBt->inTransaction==TRANS_WRITE );
2502 assert( pBt->nTransaction>0 );
2503 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2504 if( rc!=SQLITE_OK ){
2505 sqlite3BtreeLeave(p);
2508 pBt->inTransaction = TRANS_READ;
2513 /* If the handle has any kind of transaction open, decrement the transaction
2514 ** count of the shared btree. If the transaction count reaches 0, set
2515 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2516 ** will unlock the pager.
2518 if( p->inTrans!=TRANS_NONE ){
2519 pBt->nTransaction--;
2520 if( 0==pBt->nTransaction ){
2521 pBt->inTransaction = TRANS_NONE;
2525 /* Set the handles current transaction state to TRANS_NONE and unlock
2526 ** the pager if this call closed the only read or write transaction.
2528 p->inTrans = TRANS_NONE;
2529 unlockBtreeIfUnused(pBt);
2532 sqlite3BtreeLeave(p);
2537 ** Do both phases of a commit.
2539 int sqlite3BtreeCommit(Btree *p){
2541 sqlite3BtreeEnter(p);
2542 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2543 if( rc==SQLITE_OK ){
2544 rc = sqlite3BtreeCommitPhaseTwo(p);
2546 sqlite3BtreeLeave(p);
2552 ** Return the number of write-cursors open on this handle. This is for use
2553 ** in assert() expressions, so it is only compiled if NDEBUG is not
2556 ** For the purposes of this routine, a write-cursor is any cursor that
2557 ** is capable of writing to the databse. That means the cursor was
2558 ** originally opened for writing and the cursor has not be disabled
2559 ** by having its state changed to CURSOR_FAULT.
2561 static int countWriteCursors(BtShared *pBt){
2564 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2565 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2572 ** This routine sets the state to CURSOR_FAULT and the error
2573 ** code to errCode for every cursor on BtShared that pBtree
2576 ** Every cursor is tripped, including cursors that belong
2577 ** to other database connections that happen to be sharing
2578 ** the cache with pBtree.
2580 ** This routine gets called when a rollback occurs.
2581 ** All cursors using the same cache must be tripped
2582 ** to prevent them from trying to use the btree after
2583 ** the rollback. The rollback may have deleted tables
2584 ** or moved root pages, so it is not sufficient to
2585 ** save the state of the cursor. The cursor must be
2588 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2590 sqlite3BtreeEnter(pBtree);
2591 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2592 clearCursorPosition(p);
2593 p->eState = CURSOR_FAULT;
2596 sqlite3BtreeLeave(pBtree);
2600 ** Rollback the transaction in progress. All cursors will be
2601 ** invalided by this operation. Any attempt to use a cursor
2602 ** that was open at the beginning of this operation will result
2605 ** This will release the write lock on the database file. If there
2606 ** are no active cursors, it also releases the read lock.
2608 int sqlite3BtreeRollback(Btree *p){
2610 BtShared *pBt = p->pBt;
2613 sqlite3BtreeEnter(p);
2615 rc = saveAllCursors(pBt, 0, 0);
2616 #ifndef SQLITE_OMIT_SHARED_CACHE
2617 if( rc!=SQLITE_OK ){
2618 /* This is a horrible situation. An IO or malloc() error occured whilst
2619 ** trying to save cursor positions. If this is an automatic rollback (as
2620 ** the result of a constraint, malloc() failure or IO error) then
2621 ** the cache may be internally inconsistent (not contain valid trees) so
2622 ** we cannot simply return the error to the caller. Instead, abort
2623 ** all queries that may be using any of the cursors that failed to save.
2625 sqlite3BtreeTripAllCursors(p, rc);
2631 if( p->inTrans==TRANS_WRITE ){
2634 #ifndef SQLITE_OMIT_AUTOVACUUM
2638 assert( TRANS_WRITE==pBt->inTransaction );
2639 rc2 = sqlite3PagerRollback(pBt->pPager);
2640 if( rc2!=SQLITE_OK ){
2644 /* The rollback may have destroyed the pPage1->aData value. So
2645 ** call sqlite3BtreeGetPage() on page 1 again to make
2646 ** sure pPage1->aData is set correctly. */
2647 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2648 releasePage(pPage1);
2650 assert( countWriteCursors(pBt)==0 );
2651 pBt->inTransaction = TRANS_READ;
2654 if( p->inTrans!=TRANS_NONE ){
2655 assert( pBt->nTransaction>0 );
2656 pBt->nTransaction--;
2657 if( 0==pBt->nTransaction ){
2658 pBt->inTransaction = TRANS_NONE;
2662 p->inTrans = TRANS_NONE;
2664 unlockBtreeIfUnused(pBt);
2667 sqlite3BtreeLeave(p);
2672 ** Start a statement subtransaction. The subtransaction can
2673 ** can be rolled back independently of the main transaction.
2674 ** You must start a transaction before starting a subtransaction.
2675 ** The subtransaction is ended automatically if the main transaction
2676 ** commits or rolls back.
2678 ** Only one subtransaction may be active at a time. It is an error to try
2679 ** to start a new subtransaction if another subtransaction is already active.
2681 ** Statement subtransactions are used around individual SQL statements
2682 ** that are contained within a BEGIN...COMMIT block. If a constraint
2683 ** error occurs within the statement, the effect of that one statement
2684 ** can be rolled back without having to rollback the entire transaction.
2686 int sqlite3BtreeBeginStmt(Btree *p){
2688 BtShared *pBt = p->pBt;
2689 sqlite3BtreeEnter(p);
2691 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2692 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2694 assert( pBt->inTransaction==TRANS_WRITE );
2695 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2698 sqlite3BtreeLeave(p);
2704 ** Commit the statment subtransaction currently in progress. If no
2705 ** subtransaction is active, this is a no-op.
2707 int sqlite3BtreeCommitStmt(Btree *p){
2709 BtShared *pBt = p->pBt;
2710 sqlite3BtreeEnter(p);
2712 if( pBt->inStmt && !pBt->readOnly ){
2713 rc = sqlite3PagerStmtCommit(pBt->pPager);
2718 sqlite3BtreeLeave(p);
2723 ** Rollback the active statement subtransaction. If no subtransaction
2724 ** is active this routine is a no-op.
2726 ** All cursors will be invalidated by this operation. Any attempt
2727 ** to use a cursor that was open at the beginning of this operation
2728 ** will result in an error.
2730 int sqlite3BtreeRollbackStmt(Btree *p){
2732 BtShared *pBt = p->pBt;
2733 sqlite3BtreeEnter(p);
2735 if( pBt->inStmt && !pBt->readOnly ){
2736 rc = sqlite3PagerStmtRollback(pBt->pPager);
2739 sqlite3BtreeLeave(p);
2744 ** Create a new cursor for the BTree whose root is on the page
2745 ** iTable. The act of acquiring a cursor gets a read lock on
2746 ** the database file.
2748 ** If wrFlag==0, then the cursor can only be used for reading.
2749 ** If wrFlag==1, then the cursor can be used for reading or for
2750 ** writing if other conditions for writing are also met. These
2751 ** are the conditions that must be met in order for writing to
2754 ** 1: The cursor must have been opened with wrFlag==1
2756 ** 2: Other database connections that share the same pager cache
2757 ** but which are not in the READ_UNCOMMITTED state may not have
2758 ** cursors open with wrFlag==0 on the same table. Otherwise
2759 ** the changes made by this write cursor would be visible to
2760 ** the read cursors in the other database connection.
2762 ** 3: The database must be writable (not on read-only media)
2764 ** 4: There must be an active transaction.
2766 ** No checking is done to make sure that page iTable really is the
2767 ** root page of a b-tree. If it is not, then the cursor acquired
2768 ** will not work correctly.
2770 static int btreeCursor(
2771 Btree *p, /* The btree */
2772 int iTable, /* Root page of table to open */
2773 int wrFlag, /* 1 to write. 0 read-only */
2774 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2775 BtCursor *pCur /* Space for new cursor */
2778 BtShared *pBt = p->pBt;
2780 assert( sqlite3BtreeHoldsMutex(p) );
2782 if( pBt->readOnly ){
2783 return SQLITE_READONLY;
2785 if( checkReadLocks(p, iTable, 0, 0) ){
2786 return SQLITE_LOCKED;
2790 if( pBt->pPage1==0 ){
2791 rc = lockBtreeWithRetry(p);
2792 if( rc!=SQLITE_OK ){
2795 if( pBt->readOnly && wrFlag ){
2796 return SQLITE_READONLY;
2799 pCur->pgnoRoot = (Pgno)iTable;
2800 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
2802 goto create_cursor_exception;
2804 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->pPage, 0);
2805 if( rc!=SQLITE_OK ){
2806 goto create_cursor_exception;
2809 /* Now that no other errors can occur, finish filling in the BtCursor
2810 ** variables, link the cursor into the BtShared list and set *ppCur (the
2811 ** output argument to this function).
2813 pCur->pKeyInfo = pKeyInfo;
2816 pCur->wrFlag = wrFlag;
2817 pCur->pNext = pBt->pCursor;
2819 pCur->pNext->pPrev = pCur;
2821 pBt->pCursor = pCur;
2822 pCur->eState = CURSOR_INVALID;
2826 create_cursor_exception:
2827 releasePage(pCur->pPage);
2828 unlockBtreeIfUnused(pBt);
2831 int sqlite3BtreeCursor(
2832 Btree *p, /* The btree */
2833 int iTable, /* Root page of table to open */
2834 int wrFlag, /* 1 to write. 0 read-only */
2835 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2836 BtCursor *pCur /* Write new cursor here */
2839 sqlite3BtreeEnter(p);
2841 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
2842 sqlite3BtreeLeave(p);
2845 int sqlite3BtreeCursorSize(){
2846 return sizeof(BtCursor);
2852 ** Close a cursor. The read lock on the database file is released
2853 ** when the last cursor is closed.
2855 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2856 Btree *pBtree = pCur->pBtree;
2858 BtShared *pBt = pCur->pBt;
2859 sqlite3BtreeEnter(pBtree);
2860 pBt->db = pBtree->db;
2861 clearCursorPosition(pCur);
2863 pCur->pPrev->pNext = pCur->pNext;
2865 pBt->pCursor = pCur->pNext;
2868 pCur->pNext->pPrev = pCur->pPrev;
2870 releasePage(pCur->pPage);
2871 unlockBtreeIfUnused(pBt);
2872 invalidateOverflowCache(pCur);
2873 /* sqlite3_free(pCur); */
2874 sqlite3BtreeLeave(pBtree);
2880 ** Make a temporary cursor by filling in the fields of pTempCur.
2881 ** The temporary cursor is not on the cursor list for the Btree.
2883 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2884 assert( cursorHoldsMutex(pCur) );
2885 memcpy(pTempCur, pCur, sizeof(*pCur));
2886 pTempCur->pNext = 0;
2887 pTempCur->pPrev = 0;
2888 if( pTempCur->pPage ){
2889 sqlite3PagerRef(pTempCur->pPage->pDbPage);
2894 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2897 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2898 assert( cursorHoldsMutex(pCur) );
2900 sqlite3PagerUnref(pCur->pPage->pDbPage);
2905 ** Make sure the BtCursor* given in the argument has a valid
2906 ** BtCursor.info structure. If it is not already valid, call
2907 ** sqlite3BtreeParseCell() to fill it in.
2909 ** BtCursor.info is a cache of the information in the current cell.
2910 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2912 ** 2007-06-25: There is a bug in some versions of MSVC that cause the
2913 ** compiler to crash when getCellInfo() is implemented as a macro.
2914 ** But there is a measureable speed advantage to using the macro on gcc
2915 ** (when less compiler optimizations like -Os or -O0 are used and the
2916 ** compiler is not doing agressive inlining.) So we use a real function
2917 ** for MSVC and a macro for everything else. Ticket #2457.
2920 static void assertCellInfo(BtCursor *pCur){
2922 memset(&info, 0, sizeof(info));
2923 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &info);
2924 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2927 #define assertCellInfo(x)
2930 /* Use a real function in MSVC to work around bugs in that compiler. */
2931 static void getCellInfo(BtCursor *pCur){
2932 if( pCur->info.nSize==0 ){
2933 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info);
2934 pCur->validNKey = 1;
2936 assertCellInfo(pCur);
2939 #else /* if not _MSC_VER */
2940 /* Use a macro in all other compilers so that the function is inlined */
2941 #define getCellInfo(pCur) \
2942 if( pCur->info.nSize==0 ){ \
2943 sqlite3BtreeParseCell(pCur->pPage, pCur->idx, &pCur->info); \
2944 pCur->validNKey = 1; \
2946 assertCellInfo(pCur); \
2948 #endif /* _MSC_VER */
2951 ** Set *pSize to the size of the buffer needed to hold the value of
2952 ** the key for the current entry. If the cursor is not pointing
2953 ** to a valid entry, *pSize is set to 0.
2955 ** For a table with the INTKEY flag set, this routine returns the key
2956 ** itself, not the number of bytes in the key.
2958 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2961 assert( cursorHoldsMutex(pCur) );
2962 rc = restoreCursorPosition(pCur);
2963 if( rc==SQLITE_OK ){
2964 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2965 if( pCur->eState==CURSOR_INVALID ){
2969 *pSize = pCur->info.nKey;
2976 ** Set *pSize to the number of bytes of data in the entry the
2977 ** cursor currently points to. Always return SQLITE_OK.
2978 ** Failure is not possible. If the cursor is not currently
2979 ** pointing to an entry (which can happen, for example, if
2980 ** the database is empty) then *pSize is set to 0.
2982 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
2985 assert( cursorHoldsMutex(pCur) );
2986 rc = restoreCursorPosition(pCur);
2987 if( rc==SQLITE_OK ){
2988 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2989 if( pCur->eState==CURSOR_INVALID ){
2990 /* Not pointing at a valid entry - set *pSize to 0. */
2994 *pSize = pCur->info.nData;
3001 ** Given the page number of an overflow page in the database (parameter
3002 ** ovfl), this function finds the page number of the next page in the
3003 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3004 ** pointer-map data instead of reading the content of page ovfl to do so.
3006 ** If an error occurs an SQLite error code is returned. Otherwise:
3008 ** Unless pPgnoNext is NULL, the page number of the next overflow
3009 ** page in the linked list is written to *pPgnoNext. If page ovfl
3010 ** is the last page in its linked list, *pPgnoNext is set to zero.
3012 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3013 ** for page ovfl. The underlying pager page may have been requested
3014 ** with the noContent flag set, so the page data accessable via
3015 ** this handle may not be trusted.
3017 static int getOverflowPage(
3019 Pgno ovfl, /* Overflow page */
3020 MemPage **ppPage, /* OUT: MemPage handle */
3021 Pgno *pPgnoNext /* OUT: Next overflow page number */
3024 int rc = SQLITE_OK; /* Initialized to placate warning */
3026 assert( sqlite3_mutex_held(pBt->mutex) );
3027 /* One of these must not be NULL. Otherwise, why call this function? */
3028 assert(ppPage || pPgnoNext);
3030 /* If pPgnoNext is NULL, then this function is being called to obtain
3031 ** a MemPage* reference only. No page-data is required in this case.
3034 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
3037 #ifndef SQLITE_OMIT_AUTOVACUUM
3038 /* Try to find the next page in the overflow list using the
3039 ** autovacuum pointer-map pages. Guess that the next page in
3040 ** the overflow list is page number (ovfl+1). If that guess turns
3041 ** out to be wrong, fall back to loading the data of page
3042 ** number ovfl to determine the next page number.
3044 if( pBt->autoVacuum ){
3046 Pgno iGuess = ovfl+1;
3049 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3053 if( iGuess<=pagerPagecount(pBt->pPager) ){
3054 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3055 if( rc!=SQLITE_OK ){
3058 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3065 if( next==0 || ppPage ){
3068 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
3069 assert(rc==SQLITE_OK || pPage==0);
3070 if( next==0 && rc==SQLITE_OK ){
3071 next = get4byte(pPage->aData);
3086 ** Copy data from a buffer to a page, or from a page to a buffer.
3088 ** pPayload is a pointer to data stored on database page pDbPage.
3089 ** If argument eOp is false, then nByte bytes of data are copied
3090 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3091 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3092 ** of data are copied from the buffer pBuf to pPayload.
3094 ** SQLITE_OK is returned on success, otherwise an error code.
3096 static int copyPayload(
3097 void *pPayload, /* Pointer to page data */
3098 void *pBuf, /* Pointer to buffer */
3099 int nByte, /* Number of bytes to copy */
3100 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3101 DbPage *pDbPage /* Page containing pPayload */
3104 /* Copy data from buffer to page (a write operation) */
3105 int rc = sqlite3PagerWrite(pDbPage);
3106 if( rc!=SQLITE_OK ){
3109 memcpy(pPayload, pBuf, nByte);
3111 /* Copy data from page to buffer (a read operation) */
3112 memcpy(pBuf, pPayload, nByte);
3118 ** This function is used to read or overwrite payload information
3119 ** for the entry that the pCur cursor is pointing to. If the eOp
3120 ** parameter is 0, this is a read operation (data copied into
3121 ** buffer pBuf). If it is non-zero, a write (data copied from
3124 ** A total of "amt" bytes are read or written beginning at "offset".
3125 ** Data is read to or from the buffer pBuf.
3127 ** This routine does not make a distinction between key and data.
3128 ** It just reads or writes bytes from the payload area. Data might
3129 ** appear on the main page or be scattered out on multiple overflow
3132 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3133 ** cursor entry uses one or more overflow pages, this function
3134 ** allocates space for and lazily popluates the overflow page-list
3135 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3136 ** cache to make seeking to the supplied offset more efficient.
3138 ** Once an overflow page-list cache has been allocated, it may be
3139 ** invalidated if some other cursor writes to the same table, or if
3140 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3141 ** mode, the following events may invalidate an overflow page-list cache.
3143 ** * An incremental vacuum,
3144 ** * A commit in auto_vacuum="full" mode,
3145 ** * Creating a table (may require moving an overflow page).
3147 static int accessPayload(
3148 BtCursor *pCur, /* Cursor pointing to entry to read from */
3149 int offset, /* Begin reading this far into payload */
3150 int amt, /* Read this many bytes */
3151 unsigned char *pBuf, /* Write the bytes into this buffer */
3152 int skipKey, /* offset begins at data if this is true */
3153 int eOp /* zero to read. non-zero to write. */
3155 unsigned char *aPayload;
3159 MemPage *pPage = pCur->pPage; /* Btree page of current cursor entry */
3160 BtShared *pBt; /* Btree this cursor belongs to */
3163 assert( pCur->eState==CURSOR_VALID );
3164 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3165 assert( offset>=0 );
3166 assert( cursorHoldsMutex(pCur) );
3169 aPayload = pCur->info.pCell + pCur->info.nHeader;
3170 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3175 if( offset+amt > nKey+pCur->info.nData ){
3176 /* Trying to read or write past the end of the data is an error */
3177 return SQLITE_ERROR;
3180 /* Check if data must be read/written to/from the btree page itself. */
3181 if( offset<pCur->info.nLocal ){
3183 if( a+offset>pCur->info.nLocal ){
3184 a = pCur->info.nLocal - offset;
3186 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3191 offset -= pCur->info.nLocal;
3195 if( rc==SQLITE_OK && amt>0 ){
3196 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3199 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3201 #ifndef SQLITE_OMIT_INCRBLOB
3202 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3203 ** has not been allocated, allocate it now. The array is sized at
3204 ** one entry for each overflow page in the overflow chain. The
3205 ** page number of the first overflow page is stored in aOverflow[0],
3206 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3207 ** (the cache is lazily populated).
3209 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3210 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3211 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3212 if( nOvfl && !pCur->aOverflow ){
3217 /* If the overflow page-list cache has been allocated and the
3218 ** entry for the first required overflow page is valid, skip
3221 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3222 iIdx = (offset/ovflSize);
3223 nextPage = pCur->aOverflow[iIdx];
3224 offset = (offset%ovflSize);
3228 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3230 #ifndef SQLITE_OMIT_INCRBLOB
3231 /* If required, populate the overflow page-list cache. */
3232 if( pCur->aOverflow ){
3233 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3234 pCur->aOverflow[iIdx] = nextPage;
3238 if( offset>=ovflSize ){
3239 /* The only reason to read this page is to obtain the page
3240 ** number for the next page in the overflow chain. The page
3241 ** data is not required. So first try to lookup the overflow
3242 ** page-list cache, if any, then fall back to the getOverflowPage()
3245 #ifndef SQLITE_OMIT_INCRBLOB
3246 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3247 nextPage = pCur->aOverflow[iIdx+1];
3250 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3253 /* Need to read this page properly. It contains some of the
3254 ** range of data that is being read (eOp==0) or written (eOp!=0).
3258 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3259 if( rc==SQLITE_OK ){
3260 aPayload = sqlite3PagerGetData(pDbPage);
3261 nextPage = get4byte(aPayload);
3262 if( a + offset > ovflSize ){
3263 a = ovflSize - offset;
3265 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3266 sqlite3PagerUnref(pDbPage);
3275 if( rc==SQLITE_OK && amt>0 ){
3276 return SQLITE_CORRUPT_BKPT;
3282 ** Read part of the key associated with cursor pCur. Exactly
3283 ** "amt" bytes will be transfered into pBuf[]. The transfer
3284 ** begins at "offset".
3286 ** Return SQLITE_OK on success or an error code if anything goes
3287 ** wrong. An error is returned if "offset+amt" is larger than
3288 ** the available payload.
3290 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3293 assert( cursorHoldsMutex(pCur) );
3294 rc = restoreCursorPosition(pCur);
3295 if( rc==SQLITE_OK ){
3296 assert( pCur->eState==CURSOR_VALID );
3297 assert( pCur->pPage!=0 );
3298 if( pCur->pPage->intKey ){
3299 return SQLITE_CORRUPT_BKPT;
3301 assert( pCur->pPage->intKey==0 );
3302 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3303 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3309 ** Read part of the data associated with cursor pCur. Exactly
3310 ** "amt" bytes will be transfered into pBuf[]. The transfer
3311 ** begins at "offset".
3313 ** Return SQLITE_OK on success or an error code if anything goes
3314 ** wrong. An error is returned if "offset+amt" is larger than
3315 ** the available payload.
3317 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3320 #ifndef SQLITE_OMIT_INCRBLOB
3321 if ( pCur->eState==CURSOR_INVALID ){
3322 return SQLITE_ABORT;
3326 assert( cursorHoldsMutex(pCur) );
3327 rc = restoreCursorPosition(pCur);
3328 if( rc==SQLITE_OK ){
3329 assert( pCur->eState==CURSOR_VALID );
3330 assert( pCur->pPage!=0 );
3331 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3332 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3338 ** Return a pointer to payload information from the entry that the
3339 ** pCur cursor is pointing to. The pointer is to the beginning of
3340 ** the key if skipKey==0 and it points to the beginning of data if
3341 ** skipKey==1. The number of bytes of available key/data is written
3342 ** into *pAmt. If *pAmt==0, then the value returned will not be
3345 ** This routine is an optimization. It is common for the entire key
3346 ** and data to fit on the local page and for there to be no overflow
3347 ** pages. When that is so, this routine can be used to access the
3348 ** key and data without making a copy. If the key and/or data spills
3349 ** onto overflow pages, then accessPayload() must be used to reassembly
3350 ** the key/data and copy it into a preallocated buffer.
3352 ** The pointer returned by this routine looks directly into the cached
3353 ** page of the database. The data might change or move the next time
3354 ** any btree routine is called.
3356 static const unsigned char *fetchPayload(
3357 BtCursor *pCur, /* Cursor pointing to entry to read from */
3358 int *pAmt, /* Write the number of available bytes here */
3359 int skipKey /* read beginning at data if this is true */
3361 unsigned char *aPayload;
3366 assert( pCur!=0 && pCur->pPage!=0 );
3367 assert( pCur->eState==CURSOR_VALID );
3368 assert( cursorHoldsMutex(pCur) );
3369 pPage = pCur->pPage;
3370 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3372 aPayload = pCur->info.pCell;
3373 aPayload += pCur->info.nHeader;
3374 if( pPage->intKey ){
3377 nKey = pCur->info.nKey;
3381 nLocal = pCur->info.nLocal - nKey;
3383 nLocal = pCur->info.nLocal;
3394 ** For the entry that cursor pCur is point to, return as
3395 ** many bytes of the key or data as are available on the local
3396 ** b-tree page. Write the number of available bytes into *pAmt.
3398 ** The pointer returned is ephemeral. The key/data may move
3399 ** or be destroyed on the next call to any Btree routine,
3400 ** including calls from other threads against the same cache.
3401 ** Hence, a mutex on the BtShared should be held prior to calling
3404 ** These routines is used to get quick access to key and data
3405 ** in the common case where no overflow pages are used.
3407 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3408 assert( cursorHoldsMutex(pCur) );
3409 if( pCur->eState==CURSOR_VALID ){
3410 return (const void*)fetchPayload(pCur, pAmt, 0);
3414 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3415 assert( cursorHoldsMutex(pCur) );
3416 if( pCur->eState==CURSOR_VALID ){
3417 return (const void*)fetchPayload(pCur, pAmt, 1);
3424 ** Move the cursor down to a new child page. The newPgno argument is the
3425 ** page number of the child page to move to.
3427 static int moveToChild(BtCursor *pCur, u32 newPgno){
3431 BtShared *pBt = pCur->pBt;
3433 assert( cursorHoldsMutex(pCur) );
3434 assert( pCur->eState==CURSOR_VALID );
3435 rc = getAndInitPage(pBt, newPgno, &pNewPage, pCur->pPage);
3437 pNewPage->idxParent = pCur->idx;
3438 pOldPage = pCur->pPage;
3439 pOldPage->idxShift = 0;
3440 releasePage(pOldPage);
3441 pCur->pPage = pNewPage;
3443 pCur->info.nSize = 0;
3444 pCur->validNKey = 0;
3445 if( pNewPage->nCell<1 ){
3446 return SQLITE_CORRUPT_BKPT;
3452 ** Return true if the page is the virtual root of its table.
3454 ** The virtual root page is the root page for most tables. But
3455 ** for the table rooted on page 1, sometime the real root page
3456 ** is empty except for the right-pointer. In such cases the
3457 ** virtual root page is the page that the right-pointer of page
3458 ** 1 is pointing to.
3460 int sqlite3BtreeIsRootPage(MemPage *pPage){
3463 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3464 pParent = pPage->pParent;
3465 if( pParent==0 ) return 1;
3466 if( pParent->pgno>1 ) return 0;
3467 if( get2byte(&pParent->aData[pParent->hdrOffset+3])==0 ) return 1;
3472 ** Move the cursor up to the parent page.
3474 ** pCur->idx is set to the cell index that contains the pointer
3475 ** to the page we are coming from. If we are coming from the
3476 ** right-most child page then pCur->idx is set to one more than
3477 ** the largest cell index.
3479 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3484 assert( cursorHoldsMutex(pCur) );
3485 assert( pCur->eState==CURSOR_VALID );
3486 pPage = pCur->pPage;
3488 assert( !sqlite3BtreeIsRootPage(pPage) );
3489 pParent = pPage->pParent;
3490 assert( pParent!=0 );
3491 idxParent = pPage->idxParent;
3492 sqlite3PagerRef(pParent->pDbPage);
3494 pCur->pPage = pParent;
3495 pCur->info.nSize = 0;
3496 pCur->validNKey = 0;
3497 assert( pParent->idxShift==0 );
3498 pCur->idx = idxParent;
3502 ** Move the cursor to the root page
3504 static int moveToRoot(BtCursor *pCur){
3507 Btree *p = pCur->pBtree;
3508 BtShared *pBt = p->pBt;
3510 assert( cursorHoldsMutex(pCur) );
3511 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3512 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3513 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3514 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3515 if( pCur->eState==CURSOR_FAULT ){
3518 clearCursorPosition(pCur);
3520 pRoot = pCur->pPage;
3521 if( pRoot && pRoot->pgno==pCur->pgnoRoot ){
3522 assert( pRoot->isInit );
3525 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pRoot, 0))
3527 pCur->eState = CURSOR_INVALID;
3530 releasePage(pCur->pPage);
3531 pCur->pPage = pRoot;
3534 pCur->info.nSize = 0;
3536 pCur->validNKey = 0;
3537 if( pRoot->nCell==0 && !pRoot->leaf ){
3539 assert( pRoot->pgno==1 );
3540 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3541 assert( subpage>0 );
3542 pCur->eState = CURSOR_VALID;
3543 rc = moveToChild(pCur, subpage);
3545 pCur->eState = ((pCur->pPage->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3550 ** Move the cursor down to the left-most leaf entry beneath the
3551 ** entry to which it is currently pointing.
3553 ** The left-most leaf is the one with the smallest key - the first
3554 ** in ascending order.
3556 static int moveToLeftmost(BtCursor *pCur){
3561 assert( cursorHoldsMutex(pCur) );
3562 assert( pCur->eState==CURSOR_VALID );
3563 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3564 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
3565 pgno = get4byte(findCell(pPage, pCur->idx));
3566 rc = moveToChild(pCur, pgno);
3572 ** Move the cursor down to the right-most leaf entry beneath the
3573 ** page to which it is currently pointing. Notice the difference
3574 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3575 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3576 ** finds the right-most entry beneath the *page*.
3578 ** The right-most entry is the one with the largest key - the last
3579 ** key in ascending order.
3581 static int moveToRightmost(BtCursor *pCur){
3586 assert( cursorHoldsMutex(pCur) );
3587 assert( pCur->eState==CURSOR_VALID );
3588 while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
3589 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3590 pCur->idx = pPage->nCell;
3591 rc = moveToChild(pCur, pgno);
3593 if( rc==SQLITE_OK ){
3594 pCur->idx = pPage->nCell - 1;
3595 pCur->info.nSize = 0;
3596 pCur->validNKey = 0;
3601 /* Move the cursor to the first entry in the table. Return SQLITE_OK
3602 ** on success. Set *pRes to 0 if the cursor actually points to something
3603 ** or set *pRes to 1 if the table is empty.
3605 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3608 assert( cursorHoldsMutex(pCur) );
3609 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3610 rc = moveToRoot(pCur);
3611 if( rc==SQLITE_OK ){
3612 if( pCur->eState==CURSOR_INVALID ){
3613 assert( pCur->pPage->nCell==0 );
3617 assert( pCur->pPage->nCell>0 );
3619 rc = moveToLeftmost(pCur);
3625 /* Move the cursor to the last entry in the table. Return SQLITE_OK
3626 ** on success. Set *pRes to 0 if the cursor actually points to something
3627 ** or set *pRes to 1 if the table is empty.
3629 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3632 assert( cursorHoldsMutex(pCur) );
3633 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3634 rc = moveToRoot(pCur);
3635 if( rc==SQLITE_OK ){
3636 if( CURSOR_INVALID==pCur->eState ){
3637 assert( pCur->pPage->nCell==0 );
3640 assert( pCur->eState==CURSOR_VALID );
3642 rc = moveToRightmost(pCur);
3644 pCur->atLast = rc==SQLITE_OK;
3650 /* Move the cursor so that it points to an entry near the key
3651 ** specified by pKey/nKey/pUnKey. Return a success code.
3653 ** For INTKEY tables, only the nKey parameter is used. pKey
3654 ** and pUnKey must be NULL. For index tables, either pUnKey
3655 ** must point to a key that has already been unpacked, or else
3656 ** pKey/nKey describes a blob containing the key.
3658 ** If an exact match is not found, then the cursor is always
3659 ** left pointing at a leaf page which would hold the entry if it
3660 ** were present. The cursor might point to an entry that comes
3661 ** before or after the key.
3663 ** The result of comparing the key with the entry to which the
3664 ** cursor is written to *pRes if pRes!=NULL. The meaning of
3665 ** this value is as follows:
3667 ** *pRes<0 The cursor is left pointing at an entry that
3668 ** is smaller than pKey or if the table is empty
3669 ** and the cursor is therefore left point to nothing.
3671 ** *pRes==0 The cursor is left pointing at an entry that
3672 ** exactly matches pKey.
3674 ** *pRes>0 The cursor is left pointing at an entry that
3675 ** is larger than pKey.
3678 int sqlite3BtreeMoveto(
3679 BtCursor *pCur, /* The cursor to be moved */
3680 const void *pKey, /* The key content for indices. Not used by tables */
3681 UnpackedRecord *pUnKey,/* Unpacked version of pKey */
3682 i64 nKey, /* Size of pKey. Or the key for tables */
3683 int biasRight, /* If true, bias the search to the high end */
3684 int *pRes /* Search result flag */
3689 assert( cursorHoldsMutex(pCur) );
3690 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3692 /* If the cursor is already positioned at the point we are trying
3693 ** to move to, then just return without doing any work */
3694 if( pCur->eState==CURSOR_VALID && pCur->validNKey && pCur->pPage->intKey ){
3695 if( pCur->info.nKey==nKey ){
3699 if( pCur->atLast && pCur->info.nKey<nKey ){
3706 rc = moveToRoot(pCur);
3710 assert( pCur->pPage );
3711 assert( pCur->pPage->isInit );
3712 if( pCur->eState==CURSOR_INVALID ){
3714 assert( pCur->pPage->nCell==0 );
3717 if( pCur->pPage->intKey ){
3718 /* We are given an SQL table to search. The key is the integer
3719 ** rowid contained in nKey. pKey and pUnKey should both be NULL */
3720 assert( pUnKey==0 );
3722 }else if( pUnKey==0 ){
3723 /* We are to search an SQL index using a key encoded as a blob.
3724 ** The blob is found at pKey and is nKey bytes in length. Unpack
3725 ** this key so that we can use it. */
3727 pUnKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
3728 aSpace, sizeof(aSpace));
3729 if( pUnKey==0 ) return SQLITE_NOMEM;
3731 /* We are to search an SQL index using a key that is already unpacked
3732 ** and handed to us in pUnKey. */
3738 MemPage *pPage = pCur->pPage;
3739 int c = -1; /* pRes return if table is empty must be -1 */
3741 upr = pPage->nCell-1;
3742 if( !pPage->intKey && pUnKey==0 ){
3743 rc = SQLITE_CORRUPT_BKPT;
3749 pCur->idx = (upr+lwr)/2;
3751 if( lwr<=upr ) for(;;){
3754 pCur->info.nSize = 0;
3755 pCur->validNKey = 1;
3756 if( pPage->intKey ){
3758 pCell = findCell(pPage, pCur->idx) + pPage->childPtrSize;
3759 if( pPage->hasData ){
3761 pCell += getVarint32(pCell, dummy);
3763 getVarint(pCell, (u64*)&nCellKey);
3764 if( nCellKey==nKey ){
3766 }else if( nCellKey<nKey ){
3769 assert( nCellKey>nKey );
3774 pCellKey = (void *)fetchPayload(pCur, &available, 0);
3775 nCellKey = pCur->info.nKey;
3776 if( available>=nCellKey ){
3777 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey);
3779 pCellKey = sqlite3Malloc( nCellKey );
3784 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3785 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pUnKey);
3786 sqlite3_free(pCellKey);
3787 if( rc ) goto moveto_finish;
3791 pCur->info.nKey = nCellKey;
3792 if( pPage->intKey && !pPage->leaf ){
3797 if( pRes ) *pRes = 0;
3808 pCur->info.nKey = nCellKey;
3811 pCur->idx = (lwr+upr)/2;
3813 assert( lwr==upr+1 );
3814 assert( pPage->isInit );
3817 }else if( lwr>=pPage->nCell ){
3818 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3820 chldPg = get4byte(findCell(pPage, lwr));
3823 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
3824 if( pRes ) *pRes = c;
3829 pCur->info.nSize = 0;
3830 pCur->validNKey = 0;
3831 rc = moveToChild(pCur, chldPg);
3832 if( rc ) goto moveto_finish;
3836 /* If we created our own unpacked key at the top of this
3837 ** procedure, then destroy that key before returning. */
3838 sqlite3VdbeDeleteUnpackedRecord(pUnKey);
3845 ** Return TRUE if the cursor is not pointing at an entry of the table.
3847 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3848 ** past the last entry in the table or sqlite3BtreePrev() moves past
3849 ** the first entry. TRUE is also returned if the table is empty.
3851 int sqlite3BtreeEof(BtCursor *pCur){
3852 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3853 ** have been deleted? This API will need to change to return an error code
3854 ** as well as the boolean result value.
3856 return (CURSOR_VALID!=pCur->eState);
3860 ** Return the database connection handle for a cursor.
3862 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3863 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3864 return pCur->pBtree->db;
3868 ** Advance the cursor to the next entry in the database. If
3869 ** successful then set *pRes=0. If the cursor
3870 ** was already pointing to the last entry in the database before
3871 ** this routine was called, then set *pRes=1.
3873 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3877 assert( cursorHoldsMutex(pCur) );
3878 rc = restoreCursorPosition(pCur);
3879 if( rc!=SQLITE_OK ){
3883 pPage = pCur->pPage;
3884 if( CURSOR_INVALID==pCur->eState ){
3895 assert( pPage->isInit );
3896 assert( pCur->idx<pPage->nCell );
3899 pCur->info.nSize = 0;
3900 pCur->validNKey = 0;
3901 if( pCur->idx>=pPage->nCell ){
3903 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3905 rc = moveToLeftmost(pCur);
3910 if( sqlite3BtreeIsRootPage(pPage) ){
3912 pCur->eState = CURSOR_INVALID;
3915 sqlite3BtreeMoveToParent(pCur);
3916 pPage = pCur->pPage;
3917 }while( pCur->idx>=pPage->nCell );
3919 if( pPage->intKey ){
3920 rc = sqlite3BtreeNext(pCur, pRes);
3930 rc = moveToLeftmost(pCur);
3936 ** Step the cursor to the back to the previous entry in the database. If
3937 ** successful then set *pRes=0. If the cursor
3938 ** was already pointing to the first entry in the database before
3939 ** this routine was called, then set *pRes=1.
3941 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3946 assert( cursorHoldsMutex(pCur) );
3947 rc = restoreCursorPosition(pCur);
3948 if( rc!=SQLITE_OK ){
3952 if( CURSOR_INVALID==pCur->eState ){
3963 pPage = pCur->pPage;
3964 assert( pPage->isInit );
3965 assert( pCur->idx>=0 );
3967 pgno = get4byte( findCell(pPage, pCur->idx) );
3968 rc = moveToChild(pCur, pgno);
3972 rc = moveToRightmost(pCur);
3974 while( pCur->idx==0 ){
3975 if( sqlite3BtreeIsRootPage(pPage) ){
3976 pCur->eState = CURSOR_INVALID;
3980 sqlite3BtreeMoveToParent(pCur);
3981 pPage = pCur->pPage;
3984 pCur->info.nSize = 0;
3985 pCur->validNKey = 0;
3986 if( pPage->intKey && !pPage->leaf ){
3987 rc = sqlite3BtreePrevious(pCur, pRes);
3997 ** Allocate a new page from the database file.
3999 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
4000 ** has already been called on the new page.) The new page has also
4001 ** been referenced and the calling routine is responsible for calling
4002 ** sqlite3PagerUnref() on the new page when it is done.
4004 ** SQLITE_OK is returned on success. Any other return value indicates
4005 ** an error. *ppPage and *pPgno are undefined in the event of an error.
4006 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4008 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4009 ** locate a page close to the page number "nearby". This can be used in an
4010 ** attempt to keep related pages close to each other in the database file,
4011 ** which in turn can make database access faster.
4013 ** If the "exact" parameter is not 0, and the page-number nearby exists
4014 ** anywhere on the free-list, then it is guarenteed to be returned. This
4015 ** is only used by auto-vacuum databases when allocating a new table.
4017 static int allocateBtreePage(
4026 int n; /* Number of pages on the freelist */
4027 int k; /* Number of leaves on the trunk of the freelist */
4028 MemPage *pTrunk = 0;
4029 MemPage *pPrevTrunk = 0;
4031 assert( sqlite3_mutex_held(pBt->mutex) );
4032 pPage1 = pBt->pPage1;
4033 n = get4byte(&pPage1->aData[36]);
4035 /* There are pages on the freelist. Reuse one of those pages. */
4037 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4039 /* If the 'exact' parameter was true and a query of the pointer-map
4040 ** shows that the page 'nearby' is somewhere on the free-list, then
4041 ** the entire-list will be searched for that page.
4043 #ifndef SQLITE_OMIT_AUTOVACUUM
4044 if( exact && nearby<=pagerPagecount(pBt->pPager) ){
4047 assert( pBt->autoVacuum );
4048 rc = ptrmapGet(pBt, nearby, &eType, 0);
4050 if( eType==PTRMAP_FREEPAGE ){
4057 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4058 ** first free-list trunk page. iPrevTrunk is initially 1.
4060 rc = sqlite3PagerWrite(pPage1->pDbPage);
4062 put4byte(&pPage1->aData[36], n-1);
4064 /* The code within this loop is run only once if the 'searchList' variable
4065 ** is not true. Otherwise, it runs once for each trunk-page on the
4066 ** free-list until the page 'nearby' is located.
4069 pPrevTrunk = pTrunk;
4071 iTrunk = get4byte(&pPrevTrunk->aData[0]);
4073 iTrunk = get4byte(&pPage1->aData[32]);
4075 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
4078 goto end_allocate_page;
4081 k = get4byte(&pTrunk->aData[4]);
4082 if( k==0 && !searchList ){
4083 /* The trunk has no leaves and the list is not being searched.
4084 ** So extract the trunk page itself and use it as the newly
4085 ** allocated page */
4086 assert( pPrevTrunk==0 );
4087 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4089 goto end_allocate_page;
4092 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4095 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4096 }else if( k>pBt->usableSize/4 - 2 ){
4097 /* Value of k is out of range. Database corruption */
4098 rc = SQLITE_CORRUPT_BKPT;
4099 goto end_allocate_page;
4100 #ifndef SQLITE_OMIT_AUTOVACUUM
4101 }else if( searchList && nearby==iTrunk ){
4102 /* The list is being searched and this trunk page is the page
4103 ** to allocate, regardless of whether it has leaves.
4105 assert( *pPgno==iTrunk );
4108 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4110 goto end_allocate_page;
4114 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4116 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4119 /* The trunk page is required by the caller but it contains
4120 ** pointers to free-list leaves. The first leaf becomes a trunk
4121 ** page in this case.
4124 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4125 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4126 if( rc!=SQLITE_OK ){
4127 goto end_allocate_page;
4129 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4130 if( rc!=SQLITE_OK ){
4131 releasePage(pNewTrunk);
4132 goto end_allocate_page;
4134 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4135 put4byte(&pNewTrunk->aData[4], k-1);
4136 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4137 releasePage(pNewTrunk);
4139 put4byte(&pPage1->aData[32], iNewTrunk);
4141 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4143 goto end_allocate_page;
4145 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4149 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4152 /* Extract a leaf from the trunk */
4155 unsigned char *aData = pTrunk->aData;
4156 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4158 goto end_allocate_page;
4163 dist = get4byte(&aData[8]) - nearby;
4164 if( dist<0 ) dist = -dist;
4166 int d2 = get4byte(&aData[8+i*4]) - nearby;
4167 if( d2<0 ) d2 = -d2;
4177 iPage = get4byte(&aData[8+closest*4]);
4178 if( !searchList || iPage==nearby ){
4181 nPage = pagerPagecount(pBt->pPager);
4183 /* Free page off the end of the file */
4184 rc = SQLITE_CORRUPT_BKPT;
4185 goto end_allocate_page;
4187 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4188 ": %d more free pages\n",
4189 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4191 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4193 put4byte(&aData[4], k-1);
4194 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4195 if( rc==SQLITE_OK ){
4196 sqlite3PagerDontRollback((*ppPage)->pDbPage);
4197 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4198 if( rc!=SQLITE_OK ){
4199 releasePage(*ppPage);
4205 releasePage(pPrevTrunk);
4207 }while( searchList );
4209 /* There are no pages on the freelist, so create a new page at the
4210 ** end of the file */
4211 int nPage = pagerPagecount(pBt->pPager);
4214 #ifndef SQLITE_OMIT_AUTOVACUUM
4216 /* An incr-vacuum has already run within this transaction. So the
4217 ** page to allocate is not from the physical end of the file, but
4220 *pPgno = pBt->nTrunc+1;
4221 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4225 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4226 /* If *pPgno refers to a pointer-map page, allocate two new pages
4227 ** at the end of the file instead of one. The first allocated page
4228 ** becomes a new pointer-map page, the second is used by the caller.
4230 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4231 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4233 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4236 pBt->nTrunc = *pPgno;
4240 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4241 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4243 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4244 if( rc!=SQLITE_OK ){
4245 releasePage(*ppPage);
4247 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4250 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4253 releasePage(pTrunk);
4254 releasePage(pPrevTrunk);
4259 ** Add a page of the database file to the freelist.
4261 ** sqlite3PagerUnref() is NOT called for pPage.
4263 static int freePage(MemPage *pPage){
4264 BtShared *pBt = pPage->pBt;
4265 MemPage *pPage1 = pBt->pPage1;
4268 /* Prepare the page for freeing */
4269 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4270 assert( pPage->pgno>1 );
4272 releasePage(pPage->pParent);
4275 /* Increment the free page count on pPage1 */
4276 rc = sqlite3PagerWrite(pPage1->pDbPage);
4278 n = get4byte(&pPage1->aData[36]);
4279 put4byte(&pPage1->aData[36], n+1);
4281 #ifdef SQLITE_SECURE_DELETE
4282 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4283 ** always fully overwrite deleted information with zeros.
4285 rc = sqlite3PagerWrite(pPage->pDbPage);
4287 memset(pPage->aData, 0, pPage->pBt->pageSize);
4290 /* If the database supports auto-vacuum, write an entry in the pointer-map
4291 ** to indicate that the page is free.
4294 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4299 /* This is the first free page */
4300 rc = sqlite3PagerWrite(pPage->pDbPage);
4302 memset(pPage->aData, 0, 8);
4303 put4byte(&pPage1->aData[32], pPage->pgno);
4304 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4306 /* Other free pages already exist. Retrive the first trunk page
4307 ** of the freelist and find out how many leaves it has. */
4309 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4311 k = get4byte(&pTrunk->aData[4]);
4312 if( k>=pBt->usableSize/4 - 8 ){
4313 /* The trunk is full. Turn the page being freed into a new
4314 ** trunk page with no leaves.
4316 ** Note that the trunk page is not really full until it contains
4317 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4318 ** coded. But due to a coding error in versions of SQLite prior to
4319 ** 3.6.0, databases with freelist trunk pages holding more than
4320 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4321 ** to maintain backwards compatibility with older versions of SQLite,
4322 ** we will contain to restrict the number of entries to usableSize/4 - 8
4323 ** for now. At some point in the future (once everyone has upgraded
4324 ** to 3.6.0 or later) we should consider fixing the conditional above
4325 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4327 rc = sqlite3PagerWrite(pPage->pDbPage);
4328 if( rc==SQLITE_OK ){
4329 put4byte(pPage->aData, pTrunk->pgno);
4330 put4byte(&pPage->aData[4], 0);
4331 put4byte(&pPage1->aData[32], pPage->pgno);
4332 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4333 pPage->pgno, pTrunk->pgno));
4336 rc = SQLITE_CORRUPT;
4338 /* Add the newly freed page as a leaf on the current trunk */
4339 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4340 if( rc==SQLITE_OK ){
4341 put4byte(&pTrunk->aData[4], k+1);
4342 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4343 #ifndef SQLITE_SECURE_DELETE
4344 sqlite3PagerDontWrite(pPage->pDbPage);
4347 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4349 releasePage(pTrunk);
4355 ** Free any overflow pages associated with the given Cell.
4357 static int clearCell(MemPage *pPage, unsigned char *pCell){
4358 BtShared *pBt = pPage->pBt;
4365 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4366 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4367 if( info.iOverflow==0 ){
4368 return SQLITE_OK; /* No overflow pages. Return without doing anything */
4370 ovflPgno = get4byte(&pCell[info.iOverflow]);
4371 ovflPageSize = pBt->usableSize - 4;
4372 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4373 assert( ovflPgno==0 || nOvfl>0 );
4376 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
4377 return SQLITE_CORRUPT_BKPT;
4380 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4382 rc = freePage(pOvfl);
4383 sqlite3PagerUnref(pOvfl->pDbPage);
4390 ** Create the byte sequence used to represent a cell on page pPage
4391 ** and write that byte sequence into pCell[]. Overflow pages are
4392 ** allocated and filled in as necessary. The calling procedure
4393 ** is responsible for making sure sufficient space has been allocated
4396 ** Note that pCell does not necessary need to point to the pPage->aData
4397 ** area. pCell might point to some temporary storage. The cell will
4398 ** be constructed in this temporary area then copied into pPage->aData
4401 static int fillInCell(
4402 MemPage *pPage, /* The page that contains the cell */
4403 unsigned char *pCell, /* Complete text of the cell */
4404 const void *pKey, i64 nKey, /* The key */
4405 const void *pData,int nData, /* The data */
4406 int nZero, /* Extra zero bytes to append to pData */
4407 int *pnSize /* Write cell size here */
4414 MemPage *pToRelease = 0;
4415 unsigned char *pPrior;
4416 unsigned char *pPayload;
4417 BtShared *pBt = pPage->pBt;
4422 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4424 /* Fill in the header. */
4429 if( pPage->hasData ){
4430 nHeader += putVarint(&pCell[nHeader], nData+nZero);
4434 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4435 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4436 assert( info.nHeader==nHeader );
4437 assert( info.nKey==nKey );
4438 assert( info.nData==nData+nZero );
4440 /* Fill in the payload */
4441 nPayload = nData + nZero;
4442 if( pPage->intKey ){
4451 *pnSize = info.nSize;
4452 spaceLeft = info.nLocal;
4453 pPayload = &pCell[nHeader];
4454 pPrior = &pCell[info.iOverflow];
4456 while( nPayload>0 ){
4459 #ifndef SQLITE_OMIT_AUTOVACUUM
4460 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4461 if( pBt->autoVacuum ){
4465 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4472 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4473 #ifndef SQLITE_OMIT_AUTOVACUUM
4474 /* If the database supports auto-vacuum, and the second or subsequent
4475 ** overflow page is being allocated, add an entry to the pointer-map
4476 ** for that page now.
4478 ** If this is the first overflow page, then write a partial entry
4479 ** to the pointer-map. If we write nothing to this pointer-map slot,
4480 ** then the optimistic overflow chain processing in clearCell()
4481 ** may misinterpret the uninitialised values and delete the
4482 ** wrong pages from the database.
4484 if( pBt->autoVacuum && rc==SQLITE_OK ){
4485 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4486 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4493 releasePage(pToRelease);
4496 put4byte(pPrior, pgnoOvfl);
4497 releasePage(pToRelease);
4499 pPrior = pOvfl->aData;
4500 put4byte(pPrior, 0);
4501 pPayload = &pOvfl->aData[4];
4502 spaceLeft = pBt->usableSize - 4;
4505 if( n>spaceLeft ) n = spaceLeft;
4507 if( n>nSrc ) n = nSrc;
4509 memcpy(pPayload, pSrc, n);
4511 memset(pPayload, 0, n);
4523 releasePage(pToRelease);
4529 ** Change the MemPage.pParent pointer on the page whose number is
4530 ** given in the second argument so that MemPage.pParent holds the
4531 ** pointer in the third argument.
4533 ** If the final argument, updatePtrmap, is non-zero and the database
4534 ** is an auto-vacuum database, then the pointer-map entry for pgno
4537 static int reparentPage(
4538 BtShared *pBt, /* B-Tree structure */
4539 Pgno pgno, /* Page number of child being adopted */
4540 MemPage *pNewParent, /* New parent of pgno */
4541 int idx, /* Index of child page pgno in pNewParent */
4542 int updatePtrmap /* If true, update pointer-map for pgno */
4547 assert( sqlite3_mutex_held(pBt->mutex) );
4548 assert( pNewParent!=0 );
4549 if( pgno==0 ) return SQLITE_OK;
4550 assert( pBt->pPager!=0 );
4551 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
4553 pThis = (MemPage *)sqlite3PagerGetExtra(pDbPage);
4554 if( pThis->isInit ){
4555 assert( pThis->aData==sqlite3PagerGetData(pDbPage) );
4556 if( pThis->pParent!=pNewParent ){
4557 if( pThis->pParent ) sqlite3PagerUnref(pThis->pParent->pDbPage);
4558 pThis->pParent = pNewParent;
4559 sqlite3PagerRef(pNewParent->pDbPage);
4561 pThis->idxParent = idx;
4563 sqlite3PagerUnref(pDbPage);
4566 if( ISAUTOVACUUM && updatePtrmap ){
4567 return ptrmapPut(pBt, pgno, PTRMAP_BTREE, pNewParent->pgno);
4571 /* If the updatePtrmap flag was clear, assert that the entry in the
4572 ** pointer-map is already correct.
4575 pDbPage = sqlite3PagerLookup(pBt->pPager,PTRMAP_PAGENO(pBt,pgno));
4579 int rc = ptrmapGet(pBt, pgno, &eType, &ii);
4580 assert( rc==SQLITE_OK && ii==pNewParent->pgno && eType==PTRMAP_BTREE );
4581 sqlite3PagerUnref(pDbPage);
4592 ** Change the pParent pointer of all children of pPage to point back
4595 ** In other words, for every child of pPage, invoke reparentPage()
4596 ** to make sure that each child knows that pPage is its parent.
4598 ** This routine gets called after you memcpy() one page into
4601 ** If updatePtrmap is true, then the pointer-map entries for all child
4602 ** pages of pPage are updated.
4604 static int reparentChildPages(MemPage *pPage, int updatePtrmap){
4606 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4609 BtShared *pBt = pPage->pBt;
4610 Pgno iRight = get4byte(&pPage->aData[pPage->hdrOffset+8]);
4612 for(i=0; i<pPage->nCell; i++){
4613 u8 *pCell = findCell(pPage, i);
4614 rc = reparentPage(pBt, get4byte(pCell), pPage, i, updatePtrmap);
4615 if( rc!=SQLITE_OK ) return rc;
4617 rc = reparentPage(pBt, iRight, pPage, i, updatePtrmap);
4618 pPage->idxShift = 0;
4624 ** Remove the i-th cell from pPage. This routine effects pPage only.
4625 ** The cell content is not freed or deallocated. It is assumed that
4626 ** the cell content has been copied someplace else. This routine just
4627 ** removes the reference to the cell from pPage.
4629 ** "sz" must be the number of bytes in the cell.
4631 static void dropCell(MemPage *pPage, int idx, int sz){
4632 int i; /* Loop counter */
4633 int pc; /* Offset to cell content of cell being deleted */
4634 u8 *data; /* pPage->aData */
4635 u8 *ptr; /* Used to move bytes around within data[] */
4637 assert( idx>=0 && idx<pPage->nCell );
4638 assert( sz==cellSize(pPage, idx) );
4639 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4640 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4641 data = pPage->aData;
4642 ptr = &data[pPage->cellOffset + 2*idx];
4644 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4645 freeSpace(pPage, pc, sz);
4646 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4651 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4653 pPage->idxShift = 1;
4657 ** Insert a new cell on pPage at cell index "i". pCell points to the
4658 ** content of the cell.
4660 ** If the cell content will fit on the page, then put it there. If it
4661 ** will not fit, then make a copy of the cell content into pTemp if
4662 ** pTemp is not null. Regardless of pTemp, allocate a new entry
4663 ** in pPage->aOvfl[] and make it point to the cell content (either
4664 ** in pTemp or the original pCell) and also record its index.
4665 ** Allocating a new entry in pPage->aCell[] implies that
4666 ** pPage->nOverflow is incremented.
4668 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4669 ** cell. The caller will overwrite them after this function returns. If
4670 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4671 ** (but pCell+nSkip is always valid).
4673 static int insertCell(
4674 MemPage *pPage, /* Page into which we are copying */
4675 int i, /* New cell becomes the i-th cell of the page */
4676 u8 *pCell, /* Content of the new cell */
4677 int sz, /* Bytes of content in pCell */
4678 u8 *pTemp, /* Temp storage space for pCell, if needed */
4679 u8 nSkip /* Do not write the first nSkip bytes of the cell */
4681 int idx; /* Where to write new cell content in data[] */
4682 int j; /* Loop counter */
4683 int top; /* First byte of content for any cell in data[] */
4684 int end; /* First byte past the last cell pointer in data[] */
4685 int ins; /* Index in data[] where new cell pointer is inserted */
4686 int hdr; /* Offset into data[] of the page header */
4687 int cellOffset; /* Address of first cell pointer in data[] */
4688 u8 *data; /* The content of the whole page */
4689 u8 *ptr; /* Used for moving information around in data[] */
4691 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4692 assert( sz==cellSizePtr(pPage, pCell) );
4693 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4694 if( pPage->nOverflow || sz+2>pPage->nFree ){
4696 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4699 j = pPage->nOverflow++;
4700 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4701 pPage->aOvfl[j].pCell = pCell;
4702 pPage->aOvfl[j].idx = i;
4705 int rc = sqlite3PagerWrite(pPage->pDbPage);
4706 if( rc!=SQLITE_OK ){
4709 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4710 data = pPage->aData;
4711 hdr = pPage->hdrOffset;
4712 top = get2byte(&data[hdr+5]);
4713 cellOffset = pPage->cellOffset;
4714 end = cellOffset + 2*pPage->nCell + 2;
4715 ins = cellOffset + 2*i;
4716 if( end > top - sz ){
4717 defragmentPage(pPage);
4718 top = get2byte(&data[hdr+5]);
4719 assert( end + sz <= top );
4721 idx = allocateSpace(pPage, sz);
4723 assert( end <= get2byte(&data[hdr+5]) );
4726 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4727 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4731 put2byte(&data[ins], idx);
4732 put2byte(&data[hdr+3], pPage->nCell);
4733 pPage->idxShift = 1;
4734 #ifndef SQLITE_OMIT_AUTOVACUUM
4735 if( pPage->pBt->autoVacuum ){
4736 /* The cell may contain a pointer to an overflow page. If so, write
4737 ** the entry for the overflow page into the pointer map.
4740 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4741 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4742 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4743 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4744 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4745 if( rc!=SQLITE_OK ) return rc;
4755 ** Add a list of cells to a page. The page should be initially empty.
4756 ** The cells are guaranteed to fit on the page.
4758 static void assemblePage(
4759 MemPage *pPage, /* The page to be assemblied */
4760 int nCell, /* The number of cells to add to this page */
4761 u8 **apCell, /* Pointers to cell bodies */
4762 u16 *aSize /* Sizes of the cells */
4764 int i; /* Loop counter */
4765 int totalSize; /* Total size of all cells */
4766 int hdr; /* Index of page header */
4767 int cellptr; /* Address of next cell pointer */
4768 int cellbody; /* Address of next cell body */
4769 u8 *data; /* Data for the page */
4771 assert( pPage->nOverflow==0 );
4772 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4774 for(i=0; i<nCell; i++){
4775 totalSize += aSize[i];
4777 assert( totalSize+2*nCell<=pPage->nFree );
4778 assert( pPage->nCell==0 );
4779 cellptr = pPage->cellOffset;
4780 data = pPage->aData;
4781 hdr = pPage->hdrOffset;
4782 put2byte(&data[hdr+3], nCell);
4784 cellbody = allocateSpace(pPage, totalSize);
4785 assert( cellbody>0 );
4786 assert( pPage->nFree >= 2*nCell );
4787 pPage->nFree -= 2*nCell;
4788 for(i=0; i<nCell; i++){
4789 put2byte(&data[cellptr], cellbody);
4790 memcpy(&data[cellbody], apCell[i], aSize[i]);
4792 cellbody += aSize[i];
4794 assert( cellbody==pPage->pBt->usableSize );
4796 pPage->nCell = nCell;
4800 ** The following parameters determine how many adjacent pages get involved
4801 ** in a balancing operation. NN is the number of neighbors on either side
4802 ** of the page that participate in the balancing operation. NB is the
4803 ** total number of pages that participate, including the target page and
4804 ** NN neighbors on either side.
4806 ** The minimum value of NN is 1 (of course). Increasing NN above 1
4807 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4808 ** in exchange for a larger degradation in INSERT and UPDATE performance.
4809 ** The value of NN appears to give the best results overall.
4811 #define NN 1 /* Number of neighbors on either side of pPage */
4812 #define NB (NN*2+1) /* Total pages involved in the balance */
4814 /* Forward reference */
4815 static int balance(MemPage*, int);
4817 #ifndef SQLITE_OMIT_QUICKBALANCE
4819 ** This version of balance() handles the common special case where
4820 ** a new entry is being inserted on the extreme right-end of the
4821 ** tree, in other words, when the new entry will become the largest
4822 ** entry in the tree.
4824 ** Instead of trying balance the 3 right-most leaf pages, just add
4825 ** a new page to the right-hand side and put the one new entry in
4826 ** that page. This leaves the right side of the tree somewhat
4827 ** unbalanced. But odds are that we will be inserting new entries
4828 ** at the end soon afterwards so the nearly empty page will quickly
4829 ** fill up. On average.
4831 ** pPage is the leaf page which is the right-most page in the tree.
4832 ** pParent is its parent. pPage must have a single overflow entry
4833 ** which is also the right-most entry on the page.
4835 static int balance_quick(MemPage *pPage, MemPage *pParent){
4842 BtShared *pBt = pPage->pBt;
4843 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4844 int parentSize; /* Size of new divider cell */
4845 u8 parentCell[64]; /* Space for the new divider cell */
4847 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4849 /* Allocate a new page. Insert the overflow cell from pPage
4850 ** into it. Then remove the overflow cell from pPage.
4852 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4853 if( rc!=SQLITE_OK ){
4856 pCell = pPage->aOvfl[0].pCell;
4857 szCell = cellSizePtr(pPage, pCell);
4858 zeroPage(pNew, pPage->aData[0]);
4859 assemblePage(pNew, 1, &pCell, &szCell);
4860 pPage->nOverflow = 0;
4862 /* Set the parent of the newly allocated page to pParent. */
4863 pNew->pParent = pParent;
4864 sqlite3PagerRef(pParent->pDbPage);
4866 /* pPage is currently the right-child of pParent. Change this
4867 ** so that the right-child is the new page allocated above and
4868 ** pPage is the next-to-right child.
4870 ** Ignore the return value of the call to fillInCell(). fillInCell()
4871 ** may only return other than SQLITE_OK if it is required to allocate
4872 ** one or more overflow pages. Since an internal table B-Tree cell
4873 ** may never spill over onto an overflow page (it is a maximum of
4874 ** 13 bytes in size), it is not neccessary to check the return code.
4876 ** Similarly, the insertCell() function cannot fail if the page
4877 ** being inserted into is already writable and the cell does not
4878 ** contain an overflow pointer. So ignore this return code too.
4880 assert( pPage->nCell>0 );
4881 pCell = findCell(pPage, pPage->nCell-1);
4882 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4883 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4884 assert( parentSize<64 );
4885 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4886 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4887 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4888 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4890 /* If this is an auto-vacuum database, update the pointer map
4891 ** with entries for the new page, and any pointer from the
4892 ** cell on the page to an overflow page.
4895 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4896 if( rc==SQLITE_OK ){
4897 rc = ptrmapPutOvfl(pNew, 0);
4899 if( rc!=SQLITE_OK ){
4905 /* Release the reference to the new page and balance the parent page,
4906 ** in case the divider cell inserted caused it to become overfull.
4909 return balance(pParent, 0);
4911 #endif /* SQLITE_OMIT_QUICKBALANCE */
4914 ** This routine redistributes Cells on pPage and up to NN*2 siblings
4915 ** of pPage so that all pages have about the same amount of free space.
4916 ** Usually NN siblings on either side of pPage is used in the balancing,
4917 ** though more siblings might come from one side if pPage is the first
4918 ** or last child of its parent. If pPage has fewer than 2*NN siblings
4919 ** (something which can only happen if pPage is the root page or a
4920 ** child of root) then all available siblings participate in the balancing.
4922 ** The number of siblings of pPage might be increased or decreased by one or
4923 ** two in an effort to keep pages nearly full but not over full. The root page
4924 ** is special and is allowed to be nearly empty. If pPage is
4925 ** the root page, then the depth of the tree might be increased
4926 ** or decreased by one, as necessary, to keep the root page from being
4927 ** overfull or completely empty.
4929 ** Note that when this routine is called, some of the Cells on pPage
4930 ** might not actually be stored in pPage->aData[]. This can happen
4931 ** if the page is overfull. Part of the job of this routine is to
4932 ** make sure all Cells for pPage once again fit in pPage->aData[].
4934 ** In the course of balancing the siblings of pPage, the parent of pPage
4935 ** might become overfull or underfull. If that happens, then this routine
4936 ** is called recursively on the parent.
4938 ** If this routine fails for any reason, it might leave the database
4939 ** in a corrupted state. So if this routine fails, the database should
4942 static int balance_nonroot(MemPage *pPage){
4943 MemPage *pParent; /* The parent of pPage */
4944 BtShared *pBt; /* The whole database */
4945 int nCell = 0; /* Number of cells in apCell[] */
4946 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4947 int nOld; /* Number of pages in apOld[] */
4948 int nNew; /* Number of pages in apNew[] */
4949 int nDiv; /* Number of cells in apDiv[] */
4950 int i, j, k; /* Loop counters */
4951 int idx; /* Index of pPage in pParent->aCell[] */
4952 int nxDiv; /* Next divider slot in pParent->aCell[] */
4953 int rc; /* The return code */
4954 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4955 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4956 int usableSpace; /* Bytes in pPage beyond the header */
4957 int pageFlags; /* Value of pPage->aData[0] */
4958 int subtotal; /* Subtotal of bytes in cells on one page */
4959 int iSpace1 = 0; /* First unused byte of aSpace1[] */
4960 int iSpace2 = 0; /* First unused byte of aSpace2[] */
4961 int szScratch; /* Size of scratch memory requested */
4962 MemPage *apOld[NB]; /* pPage and up to two siblings */
4963 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4964 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4965 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4966 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4967 u8 *apDiv[NB]; /* Divider cells in pParent */
4968 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4969 int szNew[NB+2]; /* Combined size of cells place on i-th page */
4970 u8 **apCell = 0; /* All cells begin balanced */
4971 u16 *szCell; /* Local size of all cells in apCell[] */
4972 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4973 u8 *aSpace1; /* Space for copies of dividers cells before balance */
4974 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
4977 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4980 ** Find the parent page.
4982 assert( pPage->isInit );
4983 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4985 pParent = pPage->pParent;
4987 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4991 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4993 #ifndef SQLITE_OMIT_QUICKBALANCE
4995 ** A special case: If a new entry has just been inserted into a
4996 ** table (that is, a btree with integer keys and all data at the leaves)
4997 ** and the new entry is the right-most entry in the tree (it has the
4998 ** largest key) then use the special balance_quick() routine for
4999 ** balancing. balance_quick() is much faster and results in a tighter
5000 ** packing of data in the common case.
5004 pPage->nOverflow==1 &&
5005 pPage->aOvfl[0].idx==pPage->nCell &&
5006 pPage->pParent->pgno!=1 &&
5007 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
5009 assert( pPage->intKey );
5011 ** TODO: Check the siblings to the left of pPage. It may be that
5012 ** they are not full and no new page is required.
5014 return balance_quick(pPage, pParent);
5018 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
5023 ** Find the cell in the parent page whose left child points back
5024 ** to pPage. The "idx" variable is the index of that cell. If pPage
5025 ** is the rightmost child of pParent then set idx to pParent->nCell
5027 if( pParent->idxShift ){
5030 assert( pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
5031 for(idx=0; idx<pParent->nCell; idx++){
5032 if( get4byte(findCell(pParent, idx))==pgno ){
5036 assert( idx<pParent->nCell
5037 || get4byte(&pParent->aData[pParent->hdrOffset+8])==pgno );
5039 idx = pPage->idxParent;
5043 ** Initialize variables so that it will be safe to jump
5044 ** directly to balance_cleanup at any moment.
5047 sqlite3PagerRef(pParent->pDbPage);
5050 ** Find sibling pages to pPage and the cells in pParent that divide
5051 ** the siblings. An attempt is made to find NN siblings on either
5052 ** side of pPage. More siblings are taken from one side, however, if
5053 ** pPage there are fewer than NN siblings on the other side. If pParent
5054 ** has NB or fewer children then all children of pParent are taken.
5057 if( nxDiv + NB > pParent->nCell ){
5058 nxDiv = pParent->nCell - NB + 1;
5064 for(i=0, k=nxDiv; i<NB; i++, k++){
5065 if( k<pParent->nCell ){
5066 apDiv[i] = findCell(pParent, k);
5068 assert( !pParent->leaf );
5069 pgnoOld[i] = get4byte(apDiv[i]);
5070 }else if( k==pParent->nCell ){
5071 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
5075 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i], pParent);
5076 if( rc ) goto balance_cleanup;
5077 apOld[i]->idxParent = k;
5081 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5084 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5086 nMaxCells = (nMaxCells + 3)&~3;
5089 ** Allocate space for memory structures
5092 nMaxCells*sizeof(u8*) /* apCell */
5093 + nMaxCells*sizeof(u16) /* szCell */
5094 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
5095 + pBt->pageSize /* aSpace1 */
5096 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5097 apCell = sqlite3ScratchMalloc( szScratch );
5100 goto balance_cleanup;
5102 szCell = (u16*)&apCell[nMaxCells];
5103 aCopy[0] = (u8*)&szCell[nMaxCells];
5104 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5105 for(i=1; i<NB; i++){
5106 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5107 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5109 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5110 assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5112 aFrom = &aSpace1[pBt->pageSize];
5114 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
5117 goto balance_cleanup;
5121 ** Make copies of the content of pPage and its siblings into aOld[].
5122 ** The rest of this function will use data from the copies rather
5123 ** that the original pages since the original pages will be in the
5124 ** process of being overwritten.
5126 for(i=0; i<nOld; i++){
5127 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5128 memcpy(p, apOld[i], sizeof(MemPage));
5129 p->aData = (void*)&p[1];
5130 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
5134 ** Load pointers to all cells on sibling pages and the divider cells
5135 ** into the local apCell[] array. Make copies of the divider cells
5136 ** into space obtained form aSpace1[] and remove the the divider Cells
5139 ** If the siblings are on leaf pages, then the child pointers of the
5140 ** divider cells are stripped from the cells before they are copied
5141 ** into aSpace1[]. In this way, all cells in apCell[] are without
5142 ** child pointers. If siblings are not leaves, then all cell in
5143 ** apCell[] include child pointers. Either way, all cells in apCell[]
5146 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5147 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
5150 leafCorrection = pPage->leaf*4;
5151 leafData = pPage->hasData;
5152 for(i=0; i<nOld; i++){
5153 MemPage *pOld = apCopy[i];
5154 int limit = pOld->nCell+pOld->nOverflow;
5155 for(j=0; j<limit; j++){
5156 assert( nCell<nMaxCells );
5157 apCell[nCell] = findOverflowCell(pOld, j);
5158 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
5162 for(a=0; a<pOld->nOverflow; a++){
5163 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5164 aFrom[nCell] = 0xFF;
5172 u16 sz = cellSizePtr(pParent, apDiv[i]);
5174 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5175 ** are duplicates of keys on the child pages. We need to remove
5176 ** the divider cells from pParent, but the dividers cells are not
5177 ** added to apCell[] because they are duplicates of child cells.
5179 dropCell(pParent, nxDiv, sz);
5182 assert( nCell<nMaxCells );
5184 pTemp = &aSpace1[iSpace1];
5186 assert( sz<=pBt->pageSize/4 );
5187 assert( iSpace1<=pBt->pageSize );
5188 memcpy(pTemp, apDiv[i], sz);
5189 apCell[nCell] = pTemp+leafCorrection;
5191 aFrom[nCell] = 0xFF;
5193 dropCell(pParent, nxDiv, sz);
5194 szCell[nCell] -= leafCorrection;
5195 assert( get4byte(pTemp)==pgnoOld[i] );
5197 assert( leafCorrection==0 );
5198 /* The right pointer of the child page pOld becomes the left
5199 ** pointer of the divider cell */
5200 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
5202 assert( leafCorrection==4 );
5203 if( szCell[nCell]<4 ){
5204 /* Do not allow any cells smaller than 4 bytes. */
5214 ** Figure out the number of pages needed to hold all nCell cells.
5215 ** Store this number in "k". Also compute szNew[] which is the total
5216 ** size of all cells on the i-th page and cntNew[] which is the index
5217 ** in apCell[] of the cell that divides page i from page i+1.
5218 ** cntNew[k] should equal nCell.
5220 ** Values computed by this block:
5222 ** k: The total number of sibling pages
5223 ** szNew[i]: Spaced used on the i-th sibling page.
5224 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5225 ** the right of the i-th sibling page.
5226 ** usableSpace: Number of bytes of space available on each sibling.
5229 usableSpace = pBt->usableSize - 12 + leafCorrection;
5230 for(subtotal=k=i=0; i<nCell; i++){
5231 assert( i<nMaxCells );
5232 subtotal += szCell[i] + 2;
5233 if( subtotal > usableSpace ){
5234 szNew[k] = subtotal - szCell[i];
5236 if( leafData ){ i--; }
5241 szNew[k] = subtotal;
5246 ** The packing computed by the previous block is biased toward the siblings
5247 ** on the left side. The left siblings are always nearly full, while the
5248 ** right-most sibling might be nearly empty. This block of code attempts
5249 ** to adjust the packing of siblings to get a better balance.
5251 ** This adjustment is more than an optimization. The packing above might
5252 ** be so out of balance as to be illegal. For example, the right-most
5253 ** sibling might be completely empty. This adjustment is not optional.
5255 for(i=k-1; i>0; i--){
5256 int szRight = szNew[i]; /* Size of sibling on the right */
5257 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5258 int r; /* Index of right-most cell in left sibling */
5259 int d; /* Index of first cell to the left of right sibling */
5261 r = cntNew[i-1] - 1;
5262 d = r + 1 - leafData;
5263 assert( d<nMaxCells );
5264 assert( r<nMaxCells );
5265 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5266 szRight += szCell[d] + 2;
5267 szLeft -= szCell[r] + 2;
5269 r = cntNew[i-1] - 1;
5270 d = r + 1 - leafData;
5273 szNew[i-1] = szLeft;
5276 /* Either we found one or more cells (cntnew[0])>0) or we are the
5277 ** a virtual root page. A virtual root page is when the real root
5278 ** page is page 1 and we are the only child of that page.
5280 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5283 ** Allocate k new pages. Reuse old pages where possible.
5285 assert( pPage->pgno>1 );
5286 pageFlags = pPage->aData[0];
5290 pNew = apNew[i] = apOld[i];
5291 pgnoNew[i] = pgnoOld[i];
5293 rc = sqlite3PagerWrite(pNew->pDbPage);
5295 if( rc ) goto balance_cleanup;
5298 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5299 if( rc ) goto balance_cleanup;
5305 /* Free any old pages that were not reused as new pages.
5308 rc = freePage(apOld[i]);
5309 if( rc ) goto balance_cleanup;
5310 releasePage(apOld[i]);
5316 ** Put the new pages in accending order. This helps to
5317 ** keep entries in the disk file in order so that a scan
5318 ** of the table is a linear scan through the file. That
5319 ** in turn helps the operating system to deliver pages
5320 ** from the disk more rapidly.
5322 ** An O(n^2) insertion sort algorithm is used, but since
5323 ** n is never more than NB (a small constant), that should
5324 ** not be a problem.
5326 ** When NB==3, this one optimization makes the database
5327 ** about 25% faster for large insertions and deletions.
5329 for(i=0; i<k-1; i++){
5330 int minV = pgnoNew[i];
5332 for(j=i+1; j<k; j++){
5333 if( pgnoNew[j]<(unsigned)minV ){
5343 pgnoNew[i] = pgnoNew[minI];
5344 apNew[i] = apNew[minI];
5349 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5351 nOld>=2 ? pgnoOld[1] : 0,
5352 nOld>=3 ? pgnoOld[2] : 0,
5353 pgnoNew[0], szNew[0],
5354 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5355 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5356 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5357 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5360 ** Evenly distribute the data in apCell[] across the new pages.
5361 ** Insert divider cells into pParent as necessary.
5364 for(i=0; i<nNew; i++){
5365 /* Assemble the new sibling page. */
5366 MemPage *pNew = apNew[i];
5367 assert( j<nMaxCells );
5368 assert( pNew->pgno==pgnoNew[i] );
5369 zeroPage(pNew, pageFlags);
5370 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5371 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5372 assert( pNew->nOverflow==0 );
5374 /* If this is an auto-vacuum database, update the pointer map entries
5375 ** that point to the siblings that were rearranged. These can be: left
5376 ** children of cells, the right-child of the page, or overflow pages
5377 ** pointed to by cells.
5380 for(k=j; k<cntNew[i]; k++){
5381 assert( k<nMaxCells );
5382 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5383 rc = ptrmapPutOvfl(pNew, k-j);
5384 if( rc==SQLITE_OK && leafCorrection==0 ){
5385 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5387 if( rc!=SQLITE_OK ){
5388 goto balance_cleanup;
5396 /* If the sibling page assembled above was not the right-most sibling,
5397 ** insert a divider cell into the parent page.
5399 if( i<nNew-1 && j<nCell ){
5404 assert( j<nMaxCells );
5406 sz = szCell[j] + leafCorrection;
5407 pTemp = &aSpace2[iSpace2];
5409 memcpy(&pNew->aData[8], pCell, 4);
5411 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5413 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5414 if( rc!=SQLITE_OK ){
5415 goto balance_cleanup;
5418 }else if( leafData ){
5419 /* If the tree is a leaf-data tree, and the siblings are leaves,
5420 ** then there is no divider cell in apCell[]. Instead, the divider
5421 ** cell consists of the integer key for the right-most cell of
5422 ** the sibling-page assembled above only.
5426 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5428 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5432 /* Obscure case for non-leaf-data trees: If the cell at pCell was
5433 ** previously stored on a leaf node, and its reported size was 4
5434 ** bytes, then it may actually be smaller than this
5435 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5436 ** any cell). But it is important to pass the correct size to
5437 ** insertCell(), so reparse the cell now.
5439 ** Note that this can never happen in an SQLite data file, as all
5440 ** cells are at least 4 bytes. It only happens in b-trees used
5441 ** to evaluate "IN (SELECT ...)" and similar clauses.
5444 assert(leafCorrection==4);
5445 sz = cellSizePtr(pParent, pCell);
5449 assert( sz<=pBt->pageSize/4 );
5450 assert( iSpace2<=pBt->pageSize );
5451 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5452 if( rc!=SQLITE_OK ) goto balance_cleanup;
5453 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5455 /* If this is an auto-vacuum database, and not a leaf-data tree,
5456 ** then update the pointer map with an entry for the overflow page
5457 ** that the cell just inserted points to (if any).
5459 if( ISAUTOVACUUM && !leafData ){
5460 rc = ptrmapPutOvfl(pParent, nxDiv);
5461 if( rc!=SQLITE_OK ){
5462 goto balance_cleanup;
5469 /* Set the pointer-map entry for the new sibling page. */
5471 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5472 if( rc!=SQLITE_OK ){
5473 goto balance_cleanup;
5480 if( (pageFlags & PTF_LEAF)==0 ){
5481 u8 *zChild = &apCopy[nOld-1]->aData[8];
5482 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
5484 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5485 if( rc!=SQLITE_OK ){
5486 goto balance_cleanup;
5490 if( nxDiv==pParent->nCell+pParent->nOverflow ){
5491 /* Right-most sibling is the right-most child of pParent */
5492 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5494 /* Right-most sibling is the left child of the first entry in pParent
5495 ** past the right-most divider entry */
5496 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5500 ** Reparent children of all cells.
5502 for(i=0; i<nNew; i++){
5503 rc = reparentChildPages(apNew[i], 0);
5504 if( rc!=SQLITE_OK ) goto balance_cleanup;
5506 rc = reparentChildPages(pParent, 0);
5507 if( rc!=SQLITE_OK ) goto balance_cleanup;
5510 ** Balance the parent page. Note that the current page (pPage) might
5511 ** have been added to the freelist so it might no longer be initialized.
5512 ** But the parent page will always be initialized.
5514 assert( pParent->isInit );
5515 sqlite3ScratchFree(apCell);
5517 rc = balance(pParent, 0);
5520 ** Cleanup before returning.
5523 sqlite3PageFree(aSpace2);
5524 sqlite3ScratchFree(apCell);
5525 for(i=0; i<nOld; i++){
5526 releasePage(apOld[i]);
5528 for(i=0; i<nNew; i++){
5529 releasePage(apNew[i]);
5531 releasePage(pParent);
5532 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5533 pPage->pgno, nOld, nNew, nCell));
5538 ** This routine is called for the root page of a btree when the root
5539 ** page contains no cells. This is an opportunity to make the tree
5540 ** shallower by one level.
5542 static int balance_shallower(MemPage *pPage){
5543 MemPage *pChild; /* The only child page of pPage */
5544 Pgno pgnoChild; /* Page number for pChild */
5545 int rc = SQLITE_OK; /* Return code from subprocedures */
5546 BtShared *pBt; /* The main BTree structure */
5547 int mxCellPerPage; /* Maximum number of cells per page */
5548 u8 **apCell; /* All cells from pages being balanced */
5549 u16 *szCell; /* Local size of all cells */
5551 assert( pPage->pParent==0 );
5552 assert( pPage->nCell==0 );
5553 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5555 mxCellPerPage = MX_CELL(pBt);
5556 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
5557 if( apCell==0 ) return SQLITE_NOMEM;
5558 szCell = (u16*)&apCell[mxCellPerPage];
5560 /* The table is completely empty */
5561 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5563 /* The root page is empty but has one child. Transfer the
5564 ** information from that one child into the root page if it
5565 ** will fit. This reduces the depth of the tree by one.
5567 ** If the root page is page 1, it has less space available than
5568 ** its child (due to the 100 byte header that occurs at the beginning
5569 ** of the database fle), so it might not be able to hold all of the
5570 ** information currently contained in the child. If this is the
5571 ** case, then do not do the transfer. Leave page 1 empty except
5572 ** for the right-pointer to the child page. The child page becomes
5573 ** the virtual root of the tree.
5575 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5576 assert( pgnoChild>0 );
5577 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
5578 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5579 if( rc ) goto end_shallow_balance;
5580 if( pPage->pgno==1 ){
5581 rc = sqlite3BtreeInitPage(pChild, pPage);
5582 if( rc ) goto end_shallow_balance;
5583 assert( pChild->nOverflow==0 );
5584 if( pChild->nFree>=100 ){
5585 /* The child information will fit on the root page, so do the
5588 zeroPage(pPage, pChild->aData[0]);
5589 for(i=0; i<pChild->nCell; i++){
5590 apCell[i] = findCell(pChild,i);
5591 szCell[i] = cellSizePtr(pChild, apCell[i]);
5593 assemblePage(pPage, pChild->nCell, apCell, szCell);
5594 /* Copy the right-pointer of the child to the parent. */
5595 put4byte(&pPage->aData[pPage->hdrOffset+8],
5596 get4byte(&pChild->aData[pChild->hdrOffset+8]));
5598 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5600 /* The child has more information that will fit on the root.
5601 ** The tree is already balanced. Do nothing. */
5602 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5605 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5608 rc = sqlite3BtreeInitPage(pPage, 0);
5609 assert( rc==SQLITE_OK );
5611 TRACE(("BALANCE: transfer child %d into root %d\n",
5612 pChild->pgno, pPage->pgno));
5614 rc = reparentChildPages(pPage, 1);
5615 assert( pPage->nOverflow==0 );
5618 for(i=0; i<pPage->nCell; i++){
5619 rc = ptrmapPutOvfl(pPage, i);
5620 if( rc!=SQLITE_OK ){
5621 goto end_shallow_balance;
5625 releasePage(pChild);
5627 end_shallow_balance:
5628 sqlite3_free(apCell);
5634 ** The root page is overfull
5636 ** When this happens, Create a new child page and copy the
5637 ** contents of the root into the child. Then make the root
5638 ** page an empty page with rightChild pointing to the new
5639 ** child. Finally, call balance_internal() on the new child
5640 ** to cause it to split.
5642 static int balance_deeper(MemPage *pPage){
5643 int rc; /* Return value from subprocedures */
5644 MemPage *pChild; /* Pointer to a new child page */
5645 Pgno pgnoChild; /* Page number of the new child page */
5646 BtShared *pBt; /* The BTree */
5647 int usableSize; /* Total usable size of a page */
5648 u8 *data; /* Content of the parent page */
5649 u8 *cdata; /* Content of the child page */
5650 int hdr; /* Offset to page header in parent */
5651 int brk; /* Offset to content of first cell in parent */
5653 assert( pPage->pParent==0 );
5654 assert( pPage->nOverflow>0 );
5656 assert( sqlite3_mutex_held(pBt->mutex) );
5657 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5659 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5660 usableSize = pBt->usableSize;
5661 data = pPage->aData;
5662 hdr = pPage->hdrOffset;
5663 brk = get2byte(&data[hdr+5]);
5664 cdata = pChild->aData;
5665 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5666 memcpy(&cdata[brk], &data[brk], usableSize-brk);
5667 if( pChild->isInit ) return SQLITE_CORRUPT;
5668 rc = sqlite3BtreeInitPage(pChild, pPage);
5669 if( rc ) goto balancedeeper_out;
5670 memcpy(pChild->aOvfl, pPage->aOvfl, pPage->nOverflow*sizeof(pPage->aOvfl[0]));
5671 pChild->nOverflow = pPage->nOverflow;
5672 if( pChild->nOverflow ){
5675 assert( pChild->nCell==pPage->nCell );
5676 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5677 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5678 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5681 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5682 if( rc ) goto balancedeeper_out;
5683 for(i=0; i<pChild->nCell; i++){
5684 rc = ptrmapPutOvfl(pChild, i);
5685 if( rc!=SQLITE_OK ){
5686 goto balancedeeper_out;
5689 rc = reparentChildPages(pChild, 1);
5691 if( rc==SQLITE_OK ){
5692 rc = balance_nonroot(pChild);
5696 releasePage(pChild);
5701 ** Decide if the page pPage needs to be balanced. If balancing is
5702 ** required, call the appropriate balancing routine.
5704 static int balance(MemPage *pPage, int insert){
5706 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5707 if( pPage->pParent==0 ){
5708 rc = sqlite3PagerWrite(pPage->pDbPage);
5709 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5710 rc = balance_deeper(pPage);
5712 if( rc==SQLITE_OK && pPage->nCell==0 ){
5713 rc = balance_shallower(pPage);
5716 if( pPage->nOverflow>0 ||
5717 (!insert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5718 rc = balance_nonroot(pPage);
5725 ** This routine checks all cursors that point to table pgnoRoot.
5726 ** If any of those cursors were opened with wrFlag==0 in a different
5727 ** database connection (a database connection that shares the pager
5728 ** cache with the current connection) and that other connection
5729 ** is not in the ReadUncommmitted state, then this routine returns
5732 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5733 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5734 ** blob cursors are used for both reading and writing.
5736 ** When pgnoRoot is the root page of an intkey table, this function is also
5737 ** responsible for invalidating incremental blob cursors when the table row
5738 ** on which they are opened is deleted or modified. Cursors are invalidated
5739 ** according to the following rules:
5741 ** 1) When BtreeClearTable() is called to completely delete the contents
5742 ** of a B-Tree table, pExclude is set to zero and parameter iRow is
5743 ** set to non-zero. In this case all incremental blob cursors open
5744 ** on the table rooted at pgnoRoot are invalidated.
5746 ** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5747 ** modify a table row via an SQL statement, pExclude is set to the
5748 ** write cursor used to do the modification and parameter iRow is set
5749 ** to the integer row id of the B-Tree entry being modified. Unless
5750 ** pExclude is itself an incremental blob cursor, then all incremental
5751 ** blob cursors open on row iRow of the B-Tree are invalidated.
5753 ** 3) If both pExclude and iRow are set to zero, no incremental blob
5754 ** cursors are invalidated.
5756 static int checkReadLocks(
5763 BtShared *pBt = pBtree->pBt;
5764 sqlite3 *db = pBtree->db;
5765 assert( sqlite3BtreeHoldsMutex(pBtree) );
5766 for(p=pBt->pCursor; p; p=p->pNext){
5767 if( p==pExclude ) continue;
5768 if( p->pgnoRoot!=pgnoRoot ) continue;
5769 #ifndef SQLITE_OMIT_INCRBLOB
5770 if( p->isIncrblobHandle && (
5772 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5774 p->eState = CURSOR_INVALID;
5777 if( p->eState!=CURSOR_VALID ) continue;
5779 #ifndef SQLITE_OMIT_INCRBLOB
5780 || p->isIncrblobHandle
5783 sqlite3 *dbOther = p->pBtree->db;
5785 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5786 return SQLITE_LOCKED;
5794 ** Insert a new record into the BTree. The key is given by (pKey,nKey)
5795 ** and the data is given by (pData,nData). The cursor is used only to
5796 ** define what table the record should be inserted into. The cursor
5797 ** is left pointing at a random location.
5799 ** For an INTKEY table, only the nKey value of the key is used. pKey is
5800 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
5802 int sqlite3BtreeInsert(
5803 BtCursor *pCur, /* Insert data into the table of this cursor */
5804 const void *pKey, i64 nKey, /* The key of the new record */
5805 const void *pData, int nData, /* The data of the new record */
5806 int nZero, /* Number of extra 0 bytes to append to data */
5807 int appendBias /* True if this is likely an append */
5813 Btree *p = pCur->pBtree;
5814 BtShared *pBt = p->pBt;
5815 unsigned char *oldCell;
5816 unsigned char *newCell = 0;
5818 assert( cursorHoldsMutex(pCur) );
5819 if( pBt->inTransaction!=TRANS_WRITE ){
5820 /* Must start a transaction before doing an insert */
5821 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5824 assert( !pBt->readOnly );
5825 if( !pCur->wrFlag ){
5826 return SQLITE_PERM; /* Cursor not open for writing */
5828 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
5829 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5831 if( pCur->eState==CURSOR_FAULT ){
5835 /* Save the positions of any other cursors open on this table */
5836 clearCursorPosition(pCur);
5838 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5839 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, 0, nKey, appendBias, &loc))
5844 pPage = pCur->pPage;
5845 assert( pPage->intKey || nKey>=0 );
5846 assert( pPage->leaf || !pPage->intKey );
5847 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5848 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5849 loc==0 ? "overwrite" : "new entry"));
5850 assert( pPage->isInit );
5851 allocateTempSpace(pBt);
5852 newCell = pBt->pTmpSpace;
5853 if( newCell==0 ) return SQLITE_NOMEM;
5854 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5855 if( rc ) goto end_insert;
5856 assert( szNew==cellSizePtr(pPage, newCell) );
5857 assert( szNew<=MX_CELL_SIZE(pBt) );
5858 if( loc==0 && CURSOR_VALID==pCur->eState ){
5860 assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
5861 rc = sqlite3PagerWrite(pPage->pDbPage);
5865 oldCell = findCell(pPage, pCur->idx);
5867 memcpy(newCell, oldCell, 4);
5869 szOld = cellSizePtr(pPage, oldCell);
5870 rc = clearCell(pPage, oldCell);
5871 if( rc ) goto end_insert;
5872 dropCell(pPage, pCur->idx, szOld);
5873 }else if( loc<0 && pPage->nCell>0 ){
5874 assert( pPage->leaf );
5876 pCur->info.nSize = 0;
5877 pCur->validNKey = 0;
5879 assert( pPage->leaf );
5881 rc = insertCell(pPage, pCur->idx, newCell, szNew, 0, 0);
5882 if( rc!=SQLITE_OK ) goto end_insert;
5883 rc = balance(pPage, 1);
5884 if( rc==SQLITE_OK ){
5892 ** Delete the entry that the cursor is pointing to. The cursor
5893 ** is left pointing at a random location.
5895 int sqlite3BtreeDelete(BtCursor *pCur){
5896 MemPage *pPage = pCur->pPage;
5897 unsigned char *pCell;
5900 Btree *p = pCur->pBtree;
5901 BtShared *pBt = p->pBt;
5903 assert( cursorHoldsMutex(pCur) );
5904 assert( pPage->isInit );
5905 if( pBt->inTransaction!=TRANS_WRITE ){
5906 /* Must start a transaction before doing a delete */
5907 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5910 assert( !pBt->readOnly );
5911 if( pCur->eState==CURSOR_FAULT ){
5914 if( pCur->idx >= pPage->nCell ){
5915 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5917 if( !pCur->wrFlag ){
5918 return SQLITE_PERM; /* Did not open this cursor for writing */
5920 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
5921 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5924 /* Restore the current cursor position (a no-op if the cursor is not in
5925 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5926 ** open on the same table. Then call sqlite3PagerWrite() on the page
5927 ** that the entry will be deleted from.
5930 (rc = restoreCursorPosition(pCur))!=0 ||
5931 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5932 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5937 /* Locate the cell within its page and leave pCell pointing to the
5938 ** data. The clearCell() call frees any overflow pages associated with the
5939 ** cell. The cell itself is still intact.
5941 pCell = findCell(pPage, pCur->idx);
5943 pgnoChild = get4byte(pCell);
5945 rc = clearCell(pPage, pCell);
5952 ** The entry we are about to delete is not a leaf so if we do not
5953 ** do something we will leave a hole on an internal page.
5954 ** We have to fill the hole by moving in a cell from a leaf. The
5955 ** next Cell after the one to be deleted is guaranteed to exist and
5956 ** to be a leaf so we can use it.
5959 unsigned char *pNext;
5961 unsigned char *tempCell = 0;
5962 assert( !pPage->intKey );
5963 sqlite3BtreeGetTempCursor(pCur, &leafCur);
5964 rc = sqlite3BtreeNext(&leafCur, ¬Used);
5965 if( rc==SQLITE_OK ){
5966 rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
5968 if( rc==SQLITE_OK ){
5970 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5971 pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
5972 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5973 pNext = findCell(leafCur.pPage, leafCur.idx);
5974 szNext = cellSizePtr(leafCur.pPage, pNext);
5975 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5976 allocateTempSpace(pBt);
5977 tempCell = pBt->pTmpSpace;
5981 if( rc==SQLITE_OK ){
5982 rc = insertCell(pPage, pCur->idx, pNext-4, szNext+4, tempCell, 0);
5984 if( rc==SQLITE_OK ){
5985 put4byte(findOverflowCell(pPage, pCur->idx), pgnoChild);
5986 rc = balance(pPage, 0);
5988 if( rc==SQLITE_OK ){
5989 dropCell(leafCur.pPage, leafCur.idx, szNext);
5990 rc = balance(leafCur.pPage, 0);
5993 sqlite3BtreeReleaseTempCursor(&leafCur);
5995 TRACE(("DELETE: table=%d delete from leaf %d\n",
5996 pCur->pgnoRoot, pPage->pgno));
5997 dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
5998 rc = balance(pPage, 0);
6000 if( rc==SQLITE_OK ){
6007 ** Create a new BTree table. Write into *piTable the page
6008 ** number for the root page of the new table.
6010 ** The type of type is determined by the flags parameter. Only the
6011 ** following values of flags are currently in use. Other values for
6012 ** flags might not work:
6014 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6015 ** BTREE_ZERODATA Used for SQL indices
6017 static int btreeCreateTable(Btree *p, int *piTable, int flags){
6018 BtShared *pBt = p->pBt;
6023 assert( sqlite3BtreeHoldsMutex(p) );
6024 if( pBt->inTransaction!=TRANS_WRITE ){
6025 /* Must start a transaction first */
6026 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6029 assert( !pBt->readOnly );
6031 #ifdef SQLITE_OMIT_AUTOVACUUM
6032 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6037 if( pBt->autoVacuum ){
6038 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6039 MemPage *pPageMove; /* The page to move to. */
6041 /* Creating a new table may probably require moving an existing database
6042 ** to make room for the new tables root page. In case this page turns
6043 ** out to be an overflow page, delete all overflow page-map caches
6044 ** held by open cursors.
6046 invalidateAllOverflowCache(pBt);
6048 /* Read the value of meta[3] from the database to determine where the
6049 ** root page of the new table should go. meta[3] is the largest root-page
6050 ** created so far, so the new root-page is (meta[3]+1).
6052 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
6053 if( rc!=SQLITE_OK ){
6058 /* The new root-page may not be allocated on a pointer-map page, or the
6059 ** PENDING_BYTE page.
6061 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6062 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6065 assert( pgnoRoot>=3 );
6067 /* Allocate a page. The page that currently resides at pgnoRoot will
6068 ** be moved to the allocated page (unless the allocated page happens
6069 ** to reside at pgnoRoot).
6071 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6072 if( rc!=SQLITE_OK ){
6076 if( pgnoMove!=pgnoRoot ){
6077 /* pgnoRoot is the page that will be used for the root-page of
6078 ** the new table (assuming an error did not occur). But we were
6079 ** allocated pgnoMove. If required (i.e. if it was not allocated
6080 ** by extending the file), the current page at position pgnoMove
6081 ** is already journaled.
6086 releasePage(pPageMove);
6088 /* Move the page currently at pgnoRoot to pgnoMove. */
6089 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6090 if( rc!=SQLITE_OK ){
6093 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6094 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6098 assert( eType!=PTRMAP_ROOTPAGE );
6099 assert( eType!=PTRMAP_FREEPAGE );
6100 rc = sqlite3PagerWrite(pRoot->pDbPage);
6101 if( rc!=SQLITE_OK ){
6105 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6108 /* Obtain the page at pgnoRoot */
6109 if( rc!=SQLITE_OK ){
6112 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6113 if( rc!=SQLITE_OK ){
6116 rc = sqlite3PagerWrite(pRoot->pDbPage);
6117 if( rc!=SQLITE_OK ){
6125 /* Update the pointer-map and meta-data with the new root-page number. */
6126 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6131 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
6138 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6142 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6143 zeroPage(pRoot, flags | PTF_LEAF);
6144 sqlite3PagerUnref(pRoot->pDbPage);
6145 *piTable = (int)pgnoRoot;
6148 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6150 sqlite3BtreeEnter(p);
6152 rc = btreeCreateTable(p, piTable, flags);
6153 sqlite3BtreeLeave(p);
6158 ** Erase the given database page and all its children. Return
6159 ** the page to the freelist.
6161 static int clearDatabasePage(
6162 BtShared *pBt, /* The BTree that contains the table */
6163 Pgno pgno, /* Page number to clear */
6164 MemPage *pParent, /* Parent page. NULL for the root */
6165 int freePageFlag /* Deallocate page if true */
6169 unsigned char *pCell;
6172 assert( sqlite3_mutex_held(pBt->mutex) );
6173 if( pgno>pagerPagecount(pBt->pPager) ){
6174 return SQLITE_CORRUPT_BKPT;
6177 rc = getAndInitPage(pBt, pgno, &pPage, pParent);
6178 if( rc ) goto cleardatabasepage_out;
6179 for(i=0; i<pPage->nCell; i++){
6180 pCell = findCell(pPage, i);
6182 rc = clearDatabasePage(pBt, get4byte(pCell), pPage->pParent, 1);
6183 if( rc ) goto cleardatabasepage_out;
6185 rc = clearCell(pPage, pCell);
6186 if( rc ) goto cleardatabasepage_out;
6189 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage->pParent, 1);
6190 if( rc ) goto cleardatabasepage_out;
6193 rc = freePage(pPage);
6194 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6195 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6198 cleardatabasepage_out:
6204 ** Delete all information from a single table in the database. iTable is
6205 ** the page number of the root of the table. After this routine returns,
6206 ** the root page is empty, but still exists.
6208 ** This routine will fail with SQLITE_LOCKED if there are any open
6209 ** read cursors on the table. Open write cursors are moved to the
6210 ** root of the table.
6212 int sqlite3BtreeClearTable(Btree *p, int iTable){
6214 BtShared *pBt = p->pBt;
6215 sqlite3BtreeEnter(p);
6217 if( p->inTrans!=TRANS_WRITE ){
6218 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6219 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
6221 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6224 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
6226 sqlite3BtreeLeave(p);
6231 ** Erase all information in a table and add the root of the table to
6232 ** the freelist. Except, the root of the principle table (the one on
6233 ** page 1) is never added to the freelist.
6235 ** This routine will fail with SQLITE_LOCKED if there are any open
6236 ** cursors on the table.
6238 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6239 ** root page in the database file, then the last root page
6240 ** in the database file is moved into the slot formerly occupied by
6241 ** iTable and that last slot formerly occupied by the last root page
6242 ** is added to the freelist instead of iTable. In this say, all
6243 ** root pages are kept at the beginning of the database file, which
6244 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6245 ** page number that used to be the last root page in the file before
6246 ** the move. If no page gets moved, *piMoved is set to 0.
6247 ** The last root page is recorded in meta[3] and the value of
6248 ** meta[3] is updated by this procedure.
6250 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
6253 BtShared *pBt = p->pBt;
6255 assert( sqlite3BtreeHoldsMutex(p) );
6256 if( p->inTrans!=TRANS_WRITE ){
6257 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6260 /* It is illegal to drop a table if any cursors are open on the
6261 ** database. This is because in auto-vacuum mode the backend may
6262 ** need to move another root-page to fill a gap left by the deleted
6263 ** root page. If an open cursor was using this page a problem would
6267 return SQLITE_LOCKED;
6270 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6272 rc = sqlite3BtreeClearTable(p, iTable);
6281 #ifdef SQLITE_OMIT_AUTOVACUUM
6282 rc = freePage(pPage);
6285 if( pBt->autoVacuum ){
6287 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6288 if( rc!=SQLITE_OK ){
6293 if( iTable==maxRootPgno ){
6294 /* If the table being dropped is the table with the largest root-page
6295 ** number in the database, put the root page on the free list.
6297 rc = freePage(pPage);
6299 if( rc!=SQLITE_OK ){
6303 /* The table being dropped does not have the largest root-page
6304 ** number in the database. So move the page that does into the
6305 ** gap left by the deleted root-page.
6309 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6310 if( rc!=SQLITE_OK ){
6313 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
6315 if( rc!=SQLITE_OK ){
6318 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6319 if( rc!=SQLITE_OK ){
6322 rc = freePage(pMove);
6324 if( rc!=SQLITE_OK ){
6327 *piMoved = maxRootPgno;
6330 /* Set the new 'max-root-page' value in the database header. This
6331 ** is the old value less one, less one more if that happens to
6332 ** be a root-page number, less one again if that is the
6333 ** PENDING_BYTE_PAGE.
6336 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6339 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6342 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6344 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6346 rc = freePage(pPage);
6351 /* If sqlite3BtreeDropTable was called on page 1. */
6352 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6357 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6359 sqlite3BtreeEnter(p);
6361 rc = btreeDropTable(p, iTable, piMoved);
6362 sqlite3BtreeLeave(p);
6368 ** Read the meta-information out of a database file. Meta[0]
6369 ** is the number of free pages currently in the database. Meta[1]
6370 ** through meta[15] are available for use by higher layers. Meta[0]
6371 ** is read-only, the others are read/write.
6373 ** The schema layer numbers meta values differently. At the schema
6374 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6375 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
6377 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6381 BtShared *pBt = p->pBt;
6383 sqlite3BtreeEnter(p);
6386 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6387 ** the sqlite_master table. We grab this lock regardless of whether or
6388 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6389 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6391 rc = queryTableLock(p, 1, READ_LOCK);
6392 if( rc!=SQLITE_OK ){
6393 sqlite3BtreeLeave(p);
6397 assert( idx>=0 && idx<=15 );
6398 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6400 sqlite3BtreeLeave(p);
6403 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6404 *pMeta = get4byte(&pP1[36 + idx*4]);
6405 sqlite3PagerUnref(pDbPage);
6407 /* If autovacuumed is disabled in this build but we are trying to
6408 ** access an autovacuumed database, then make the database readonly.
6410 #ifdef SQLITE_OMIT_AUTOVACUUM
6411 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6414 /* Grab the read-lock on page 1. */
6415 rc = lockTable(p, 1, READ_LOCK);
6416 sqlite3BtreeLeave(p);
6421 ** Write meta-information back into the database. Meta[0] is
6422 ** read-only and may not be written.
6424 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6425 BtShared *pBt = p->pBt;
6428 assert( idx>=1 && idx<=15 );
6429 sqlite3BtreeEnter(p);
6431 if( p->inTrans!=TRANS_WRITE ){
6432 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6434 assert( pBt->pPage1!=0 );
6435 pP1 = pBt->pPage1->aData;
6436 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6437 if( rc==SQLITE_OK ){
6438 put4byte(&pP1[36 + idx*4], iMeta);
6439 #ifndef SQLITE_OMIT_AUTOVACUUM
6441 assert( pBt->autoVacuum || iMeta==0 );
6442 assert( iMeta==0 || iMeta==1 );
6443 pBt->incrVacuum = iMeta;
6448 sqlite3BtreeLeave(p);
6453 ** Return the flag byte at the beginning of the page that the cursor
6454 ** is currently pointing to.
6456 int sqlite3BtreeFlags(BtCursor *pCur){
6457 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6458 ** restoreCursorPosition() here.
6461 restoreCursorPosition(pCur);
6462 pPage = pCur->pPage;
6463 assert( cursorHoldsMutex(pCur) );
6464 assert( pPage->pBt==pCur->pBt );
6465 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6470 ** Return the pager associated with a BTree. This routine is used for
6471 ** testing and debugging only.
6473 Pager *sqlite3BtreePager(Btree *p){
6474 return p->pBt->pPager;
6477 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6479 ** Append a message to the error message string.
6481 static void checkAppendMsg(
6482 IntegrityCk *pCheck,
6484 const char *zFormat,
6488 if( !pCheck->mxErr ) return;
6491 va_start(ap, zFormat);
6492 if( pCheck->errMsg.nChar ){
6493 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
6496 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6498 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6500 if( pCheck->errMsg.mallocFailed ){
6501 pCheck->mallocFailed = 1;
6504 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6506 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6508 ** Add 1 to the reference count for page iPage. If this is the second
6509 ** reference to the page, add an error message to pCheck->zErrMsg.
6510 ** Return 1 if there are 2 ore more references to the page and 0 if
6511 ** if this is the first reference to the page.
6513 ** Also check that the page number is in bounds.
6515 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6516 if( iPage==0 ) return 1;
6517 if( iPage>pCheck->nPage || iPage<0 ){
6518 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6521 if( pCheck->anRef[iPage]==1 ){
6522 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6525 return (pCheck->anRef[iPage]++)>1;
6528 #ifndef SQLITE_OMIT_AUTOVACUUM
6530 ** Check that the entry in the pointer-map for page iChild maps to
6531 ** page iParent, pointer type ptrType. If not, append an error message
6534 static void checkPtrmap(
6535 IntegrityCk *pCheck, /* Integrity check context */
6536 Pgno iChild, /* Child page number */
6537 u8 eType, /* Expected pointer map type */
6538 Pgno iParent, /* Expected pointer map parent page number */
6539 char *zContext /* Context description (used for error msg) */
6545 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6546 if( rc!=SQLITE_OK ){
6547 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6551 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6552 checkAppendMsg(pCheck, zContext,
6553 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6554 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6560 ** Check the integrity of the freelist or of an overflow page list.
6561 ** Verify that the number of pages on the list is N.
6563 static void checkList(
6564 IntegrityCk *pCheck, /* Integrity checking context */
6565 int isFreeList, /* True for a freelist. False for overflow page list */
6566 int iPage, /* Page number for first page in the list */
6567 int N, /* Expected number of pages in the list */
6568 char *zContext /* Context for error messages */
6573 while( N-- > 0 && pCheck->mxErr ){
6575 unsigned char *pOvflData;
6577 checkAppendMsg(pCheck, zContext,
6578 "%d of %d pages missing from overflow list starting at %d",
6579 N+1, expected, iFirst);
6582 if( checkRef(pCheck, iPage, zContext) ) break;
6583 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6584 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6587 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6589 int n = get4byte(&pOvflData[4]);
6590 #ifndef SQLITE_OMIT_AUTOVACUUM
6591 if( pCheck->pBt->autoVacuum ){
6592 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6595 if( n>pCheck->pBt->usableSize/4-2 ){
6596 checkAppendMsg(pCheck, zContext,
6597 "freelist leaf count too big on page %d", iPage);
6601 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6602 #ifndef SQLITE_OMIT_AUTOVACUUM
6603 if( pCheck->pBt->autoVacuum ){
6604 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6607 checkRef(pCheck, iFreePage, zContext);
6612 #ifndef SQLITE_OMIT_AUTOVACUUM
6614 /* If this database supports auto-vacuum and iPage is not the last
6615 ** page in this overflow list, check that the pointer-map entry for
6616 ** the following page matches iPage.
6618 if( pCheck->pBt->autoVacuum && N>0 ){
6619 i = get4byte(pOvflData);
6620 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6624 iPage = get4byte(pOvflData);
6625 sqlite3PagerUnref(pOvflPage);
6628 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6630 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6632 ** Do various sanity checks on a single page of a tree. Return
6633 ** the tree depth. Root pages return 0. Parents of root pages
6634 ** return 1, and so forth.
6636 ** These checks are done:
6638 ** 1. Make sure that cells and freeblocks do not overlap
6639 ** but combine to completely cover the page.
6640 ** NO 2. Make sure cell keys are in order.
6641 ** NO 3. Make sure no key is less than or equal to zLowerBound.
6642 ** NO 4. Make sure no key is greater than or equal to zUpperBound.
6643 ** 5. Check the integrity of overflow pages.
6644 ** 6. Recursively call checkTreePage on all children.
6645 ** 7. Verify that the depth of all children is the same.
6646 ** 8. Make sure this page is at least 33% full or else it is
6647 ** the root of the tree.
6649 static int checkTreePage(
6650 IntegrityCk *pCheck, /* Context for the sanity check */
6651 int iPage, /* Page number of the page to check */
6652 MemPage *pParent, /* Parent page */
6653 char *zParentContext /* Parent context */
6656 int i, rc, depth, d2, pgno, cnt;
6665 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6667 /* Check that the page exists
6670 usableSize = pBt->usableSize;
6671 if( iPage==0 ) return 0;
6672 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6673 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6674 checkAppendMsg(pCheck, zContext,
6675 "unable to get the page. error code=%d", rc);
6678 if( (rc = sqlite3BtreeInitPage(pPage, pParent))!=0 ){
6679 checkAppendMsg(pCheck, zContext,
6680 "sqlite3BtreeInitPage() returns error code %d", rc);
6685 /* Check out all the cells.
6688 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6693 /* Check payload overflow pages
6695 sqlite3_snprintf(sizeof(zContext), zContext,
6696 "On tree page %d cell %d: ", iPage, i);
6697 pCell = findCell(pPage,i);
6698 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6700 if( !pPage->intKey ) sz += info.nKey;
6701 assert( sz==info.nPayload );
6702 if( sz>info.nLocal ){
6703 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6704 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6705 #ifndef SQLITE_OMIT_AUTOVACUUM
6706 if( pBt->autoVacuum ){
6707 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6710 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6713 /* Check sanity of left child page.
6716 pgno = get4byte(pCell);
6717 #ifndef SQLITE_OMIT_AUTOVACUUM
6718 if( pBt->autoVacuum ){
6719 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6722 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6723 if( i>0 && d2!=depth ){
6724 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6730 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6731 sqlite3_snprintf(sizeof(zContext), zContext,
6732 "On page %d at right child: ", iPage);
6733 #ifndef SQLITE_OMIT_AUTOVACUUM
6734 if( pBt->autoVacuum ){
6735 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6738 checkTreePage(pCheck, pgno, pPage, zContext);
6741 /* Check for complete coverage of the page
6743 data = pPage->aData;
6744 hdr = pPage->hdrOffset;
6745 hit = sqlite3PageMalloc( pBt->pageSize );
6747 pCheck->mallocFailed = 1;
6749 memset(hit, 0, usableSize );
6750 memset(hit, 1, get2byte(&data[hdr+5]));
6751 nCell = get2byte(&data[hdr+3]);
6752 cellStart = hdr + 12 - 4*pPage->leaf;
6753 for(i=0; i<nCell; i++){
6754 int pc = get2byte(&data[cellStart+i*2]);
6755 u16 size = cellSizePtr(pPage, &data[pc]);
6757 if( (pc+size-1)>=usableSize || pc<0 ){
6758 checkAppendMsg(pCheck, 0,
6759 "Corruption detected in cell %d on page %d",i,iPage,0);
6761 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6764 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6766 int size = get2byte(&data[i+2]);
6768 if( (i+size-1)>=usableSize || i<0 ){
6769 checkAppendMsg(pCheck, 0,
6770 "Corruption detected in cell %d on page %d",i,iPage,0);
6772 for(j=i+size-1; j>=i; j--) hit[j]++;
6774 i = get2byte(&data[i]);
6776 for(i=cnt=0; i<usableSize; i++){
6779 }else if( hit[i]>1 ){
6780 checkAppendMsg(pCheck, 0,
6781 "Multiple uses for byte %d of page %d", i, iPage);
6785 if( cnt!=data[hdr+7] ){
6786 checkAppendMsg(pCheck, 0,
6787 "Fragmented space is %d byte reported as %d on page %d",
6788 cnt, data[hdr+7], iPage);
6791 sqlite3PageFree(hit);
6796 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6798 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6800 ** This routine does a complete check of the given BTree file. aRoot[] is
6801 ** an array of pages numbers were each page number is the root page of
6802 ** a table. nRoot is the number of entries in aRoot.
6804 ** Write the number of error seen in *pnErr. Except for some memory
6805 ** allocation errors, nn error message is held in memory obtained from
6806 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
6809 char *sqlite3BtreeIntegrityCheck(
6810 Btree *p, /* The btree to be checked */
6811 int *aRoot, /* An array of root pages numbers for individual trees */
6812 int nRoot, /* Number of entries in aRoot[] */
6813 int mxErr, /* Stop reporting errors after this many */
6814 int *pnErr /* Write number of errors seen to this variable */
6819 BtShared *pBt = p->pBt;
6822 sqlite3BtreeEnter(p);
6824 nRef = sqlite3PagerRefcount(pBt->pPager);
6825 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6827 sqlite3BtreeLeave(p);
6828 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
6831 sCheck.pPager = pBt->pPager;
6832 sCheck.nPage = pagerPagecount(sCheck.pPager);
6833 sCheck.mxErr = mxErr;
6835 sCheck.mallocFailed = 0;
6837 #ifndef SQLITE_OMIT_AUTOVACUUM
6838 if( pBt->nTrunc!=0 ){
6839 sCheck.nPage = pBt->nTrunc;
6842 if( sCheck.nPage==0 ){
6843 unlockBtreeIfUnused(pBt);
6844 sqlite3BtreeLeave(p);
6847 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6848 if( !sCheck.anRef ){
6849 unlockBtreeIfUnused(pBt);
6851 sqlite3BtreeLeave(p);
6854 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6855 i = PENDING_BYTE_PAGE(pBt);
6856 if( i<=sCheck.nPage ){
6857 sCheck.anRef[i] = 1;
6859 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
6861 /* Check the integrity of the freelist
6863 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6864 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6866 /* Check all the tables.
6868 for(i=0; i<nRoot && sCheck.mxErr; i++){
6869 if( aRoot[i]==0 ) continue;
6870 #ifndef SQLITE_OMIT_AUTOVACUUM
6871 if( pBt->autoVacuum && aRoot[i]>1 ){
6872 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6875 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6878 /* Make sure every page in the file is referenced
6880 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6881 #ifdef SQLITE_OMIT_AUTOVACUUM
6882 if( sCheck.anRef[i]==0 ){
6883 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6886 /* If the database supports auto-vacuum, make sure no tables contain
6887 ** references to pointer-map pages.
6889 if( sCheck.anRef[i]==0 &&
6890 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6891 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6893 if( sCheck.anRef[i]!=0 &&
6894 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6895 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6900 /* Make sure this analysis did not leave any unref() pages
6902 unlockBtreeIfUnused(pBt);
6903 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6904 checkAppendMsg(&sCheck, 0,
6905 "Outstanding page count goes from %d to %d during this analysis",
6906 nRef, sqlite3PagerRefcount(pBt->pPager)
6910 /* Clean up and report errors.
6912 sqlite3BtreeLeave(p);
6913 sqlite3_free(sCheck.anRef);
6914 if( sCheck.mallocFailed ){
6915 sqlite3StrAccumReset(&sCheck.errMsg);
6916 *pnErr = sCheck.nErr+1;
6919 *pnErr = sCheck.nErr;
6920 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
6921 return sqlite3StrAccumFinish(&sCheck.errMsg);
6923 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6926 ** Return the full pathname of the underlying database file.
6928 ** The pager filename is invariant as long as the pager is
6929 ** open so it is safe to access without the BtShared mutex.
6931 const char *sqlite3BtreeGetFilename(Btree *p){
6932 assert( p->pBt->pPager!=0 );
6933 return sqlite3PagerFilename(p->pBt->pPager);
6937 ** Return the pathname of the directory that contains the database file.
6939 ** The pager directory name is invariant as long as the pager is
6940 ** open so it is safe to access without the BtShared mutex.
6942 const char *sqlite3BtreeGetDirname(Btree *p){
6943 assert( p->pBt->pPager!=0 );
6944 return sqlite3PagerDirname(p->pBt->pPager);
6948 ** Return the pathname of the journal file for this database. The return
6949 ** value of this routine is the same regardless of whether the journal file
6950 ** has been created or not.
6952 ** The pager journal filename is invariant as long as the pager is
6953 ** open so it is safe to access without the BtShared mutex.
6955 const char *sqlite3BtreeGetJournalname(Btree *p){
6956 assert( p->pBt->pPager!=0 );
6957 return sqlite3PagerJournalname(p->pBt->pPager);
6960 #ifndef SQLITE_OMIT_VACUUM
6962 ** Copy the complete content of pBtFrom into pBtTo. A transaction
6963 ** must be active for both files.
6965 ** The size of file pTo may be reduced by this operation.
6966 ** If anything goes wrong, the transaction on pTo is rolled back.
6968 ** If successful, CommitPhaseOne() may be called on pTo before returning.
6969 ** The caller should finish committing the transaction on pTo by calling
6970 ** sqlite3BtreeCommit().
6972 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
6976 Pgno nFromPage; /* Number of pages in pFrom */
6977 Pgno nToPage; /* Number of pages in pTo */
6978 Pgno nNewPage; /* Number of pages in pTo after the copy */
6980 Pgno iSkip; /* Pending byte page in pTo */
6981 int nToPageSize; /* Page size of pTo in bytes */
6982 int nFromPageSize; /* Page size of pFrom in bytes */
6984 BtShared *pBtTo = pTo->pBt;
6985 BtShared *pBtFrom = pFrom->pBt;
6986 pBtTo->db = pTo->db;
6987 pBtFrom->db = pFrom->db;
6989 nToPageSize = pBtTo->pageSize;
6990 nFromPageSize = pBtFrom->pageSize;
6992 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
6993 return SQLITE_ERROR;
6995 if( pBtTo->pCursor ){
6999 nToPage = pagerPagecount(pBtTo->pPager);
7000 nFromPage = pagerPagecount(pBtFrom->pPager);
7001 iSkip = PENDING_BYTE_PAGE(pBtTo);
7003 /* Variable nNewPage is the number of pages required to store the
7004 ** contents of pFrom using the current page-size of pTo.
7006 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
7009 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7011 /* Journal the original page.
7013 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7014 ** in database *pTo (before the copy). This page is never written
7015 ** into the journal file. Unless i==iSkip or the page was not
7016 ** present in pTo before the copy operation, journal page i from pTo.
7018 if( i!=iSkip && i<=nToPage ){
7019 DbPage *pDbPage = 0;
7020 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
7021 if( rc==SQLITE_OK ){
7022 rc = sqlite3PagerWrite(pDbPage);
7023 if( rc==SQLITE_OK && i>nFromPage ){
7024 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7025 ** that is because the names of those procedures do not exactly
7026 ** represent what they do. Write() really means "put this page in the
7027 ** rollback journal and mark it as dirty so that it will be written
7028 ** to the database file later." DontWrite() undoes the second part of
7029 ** that and prevents the page from being written to the database. The
7030 ** page is still on the rollback journal, though. And that is the
7031 ** whole point of this block: to put pages on the rollback journal.
7033 sqlite3PagerDontWrite(pDbPage);
7035 sqlite3PagerUnref(pDbPage);
7039 /* Overwrite the data in page i of the target database */
7040 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7042 DbPage *pToPage = 0;
7045 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7046 if( rc==SQLITE_OK ){
7047 rc = sqlite3PagerWrite(pToPage);
7051 iOff=(i-1)*nToPageSize;
7052 rc==SQLITE_OK && iOff<i*nToPageSize;
7053 iOff += nFromPageSize
7055 DbPage *pFromPage = 0;
7056 Pgno iFrom = (iOff/nFromPageSize)+1;
7058 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7062 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7063 if( rc==SQLITE_OK ){
7064 char *zTo = sqlite3PagerGetData(pToPage);
7065 char *zFrom = sqlite3PagerGetData(pFromPage);
7068 if( nFromPageSize>=nToPageSize ){
7069 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7070 nCopy = nToPageSize;
7072 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7073 nCopy = nFromPageSize;
7076 memcpy(zTo, zFrom, nCopy);
7077 sqlite3PagerUnref(pFromPage);
7081 if( pToPage ) sqlite3PagerUnref(pToPage);
7085 /* If things have worked so far, the database file may need to be
7086 ** truncated. The complex part is that it may need to be truncated to
7087 ** a size that is not an integer multiple of nToPageSize - the current
7088 ** page size used by the pager associated with B-Tree pTo.
7090 ** For example, say the page-size of pTo is 2048 bytes and the original
7091 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7092 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7094 if( rc==SQLITE_OK ){
7095 if( nFromPageSize!=nToPageSize ){
7096 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7097 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7098 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7099 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7101 assert( iSize<=iNow );
7103 /* Commit phase one syncs the journal file associated with pTo
7104 ** containing the original data. It does not sync the database file
7105 ** itself. After doing this it is safe to use OsTruncate() and other
7106 ** file APIs on the database file directly.
7108 pBtTo->db = pTo->db;
7109 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7110 if( iSize<iNow && rc==SQLITE_OK ){
7111 rc = sqlite3OsTruncate(pFile, iSize);
7114 /* The loop that copied data from database pFrom to pTo did not
7115 ** populate the locking page of database pTo. If the page-size of
7116 ** pFrom is smaller than that of pTo, this means some data will
7117 ** not have been copied.
7119 ** This block copies the missing data from database pFrom to pTo
7120 ** using file APIs. This is safe because at this point we know that
7121 ** all of the original data from pTo has been synced into the
7122 ** journal file. At this point it would be safe to do anything at
7123 ** all to the database file except truncate it to zero bytes.
7125 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7129 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7130 iOff += nFromPageSize
7132 DbPage *pFromPage = 0;
7133 Pgno iFrom = (iOff/nFromPageSize)+1;
7135 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7139 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7140 if( rc==SQLITE_OK ){
7141 char *zFrom = sqlite3PagerGetData(pFromPage);
7142 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
7143 sqlite3PagerUnref(pFromPage);
7148 /* Sync the database file */
7149 if( rc==SQLITE_OK ){
7150 rc = sqlite3PagerSync(pBtTo->pPager);
7153 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7155 if( rc==SQLITE_OK ){
7156 pBtTo->pageSizeFixed = 0;
7161 sqlite3BtreeRollback(pTo);
7166 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7168 sqlite3BtreeEnter(pTo);
7169 sqlite3BtreeEnter(pFrom);
7170 rc = btreeCopyFile(pTo, pFrom);
7171 sqlite3BtreeLeave(pFrom);
7172 sqlite3BtreeLeave(pTo);
7176 #endif /* SQLITE_OMIT_VACUUM */
7179 ** Return non-zero if a transaction is active.
7181 int sqlite3BtreeIsInTrans(Btree *p){
7182 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7183 return (p && (p->inTrans==TRANS_WRITE));
7187 ** Return non-zero if a statement transaction is active.
7189 int sqlite3BtreeIsInStmt(Btree *p){
7190 assert( sqlite3BtreeHoldsMutex(p) );
7191 return (p->pBt && p->pBt->inStmt);
7195 ** Return non-zero if a read (or write) transaction is active.
7197 int sqlite3BtreeIsInReadTrans(Btree *p){
7198 assert( sqlite3_mutex_held(p->db->mutex) );
7199 return (p && (p->inTrans!=TRANS_NONE));
7203 ** This function returns a pointer to a blob of memory associated with
7204 ** a single shared-btree. The memory is used by client code for its own
7205 ** purposes (for example, to store a high-level schema associated with
7206 ** the shared-btree). The btree layer manages reference counting issues.
7208 ** The first time this is called on a shared-btree, nBytes bytes of memory
7209 ** are allocated, zeroed, and returned to the caller. For each subsequent
7210 ** call the nBytes parameter is ignored and a pointer to the same blob
7211 ** of memory returned.
7213 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7214 ** allocated, a null pointer is returned. If the blob has already been
7215 ** allocated, it is returned as normal.
7217 ** Just before the shared-btree is closed, the function passed as the
7218 ** xFree argument when the memory allocation was made is invoked on the
7219 ** blob of allocated memory. This function should not call sqlite3_free()
7220 ** on the memory, the btree layer does that.
7222 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7223 BtShared *pBt = p->pBt;
7224 sqlite3BtreeEnter(p);
7225 if( !pBt->pSchema && nBytes ){
7226 pBt->pSchema = sqlite3MallocZero(nBytes);
7227 pBt->xFreeSchema = xFree;
7229 sqlite3BtreeLeave(p);
7230 return pBt->pSchema;
7234 ** Return true if another user of the same shared btree as the argument
7235 ** handle holds an exclusive lock on the sqlite_master table.
7237 int sqlite3BtreeSchemaLocked(Btree *p){
7239 assert( sqlite3_mutex_held(p->db->mutex) );
7240 sqlite3BtreeEnter(p);
7241 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7242 sqlite3BtreeLeave(p);
7247 #ifndef SQLITE_OMIT_SHARED_CACHE
7249 ** Obtain a lock on the table whose root page is iTab. The
7250 ** lock is a write lock if isWritelock is true or a read lock
7253 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7256 u8 lockType = READ_LOCK + isWriteLock;
7257 assert( READ_LOCK+1==WRITE_LOCK );
7258 assert( isWriteLock==0 || isWriteLock==1 );
7259 sqlite3BtreeEnter(p);
7260 rc = queryTableLock(p, iTab, lockType);
7261 if( rc==SQLITE_OK ){
7262 rc = lockTable(p, iTab, lockType);
7264 sqlite3BtreeLeave(p);
7270 #ifndef SQLITE_OMIT_INCRBLOB
7272 ** Argument pCsr must be a cursor opened for writing on an
7273 ** INTKEY table currently pointing at a valid table entry.
7274 ** This function modifies the data stored as part of that entry.
7275 ** Only the data content may only be modified, it is not possible
7276 ** to change the length of the data stored.
7278 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7279 assert( cursorHoldsMutex(pCsr) );
7280 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7281 assert(pCsr->isIncrblobHandle);
7283 restoreCursorPosition(pCsr);
7284 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7285 if( pCsr->eState!=CURSOR_VALID ){
7286 return SQLITE_ABORT;
7289 /* Check some preconditions:
7290 ** (a) the cursor is open for writing,
7291 ** (b) there is no read-lock on the table being modified and
7292 ** (c) the cursor points at a valid row of an intKey table.
7294 if( !pCsr->wrFlag ){
7295 return SQLITE_READONLY;
7297 assert( !pCsr->pBt->readOnly
7298 && pCsr->pBt->inTransaction==TRANS_WRITE );
7299 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
7300 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7302 if( pCsr->eState==CURSOR_INVALID || !pCsr->pPage->intKey ){
7303 return SQLITE_ERROR;
7306 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
7310 ** Set a flag on this cursor to cache the locations of pages from the
7311 ** overflow list for the current row. This is used by cursors opened
7312 ** for incremental blob IO only.
7314 ** This function sets a flag only. The actual page location cache
7315 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7316 ** accessPayload() (the worker function for sqlite3BtreeData() and
7317 ** sqlite3BtreePutData()).
7319 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7320 assert( cursorHoldsMutex(pCur) );
7321 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7322 assert(!pCur->isIncrblobHandle);
7323 assert(!pCur->aOverflow);
7324 pCur->isIncrblobHandle = 1;