Update contrib.
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** $Id: btree.c,v 1.524 2008/09/30 17:18:17 drh Exp $
14 ** This file implements a external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
21 ** The header string that appears at the beginning of every
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
27 ** Set this global variable to 1 to enable tracing using the TRACE
31 int sqlite3BtreeTrace=0; /* True to enable tracing */
32 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
38 ** Sometimes we need a small amount of code such as a variable initialization
39 ** to setup for a later assert() statement. We do not want this code to
40 ** appear when assert() is disabled. The following macro is therefore
41 ** used to contain that setup code. The "VVA" acronym stands for
42 ** "Verification, Validation, and Accreditation". In other words, the
43 ** code within VVA_ONLY() will only run during verification processes.
46 # define VVA_ONLY(X) X
53 #ifndef SQLITE_OMIT_SHARED_CACHE
55 ** A list of BtShared objects that are eligible for participation
56 ** in shared cache. This variable has file scope during normal builds,
57 ** but the test harness needs to access it so we make it global for
61 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
63 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
65 #endif /* SQLITE_OMIT_SHARED_CACHE */
67 #ifndef SQLITE_OMIT_SHARED_CACHE
69 ** Enable or disable the shared pager and schema features.
71 ** This routine has no effect on existing database connections.
72 ** The shared cache setting effects only future calls to
73 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
75 SQLITE_EXPORT int sqlite3_enable_shared_cache(int enable){
76 sqlite3GlobalConfig.sharedCacheEnabled = enable;
83 ** Forward declaration
85 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
88 #ifdef SQLITE_OMIT_SHARED_CACHE
90 ** The functions queryTableLock(), lockTable() and unlockAllTables()
91 ** manipulate entries in the BtShared.pLock linked list used to store
92 ** shared-cache table level locks. If the library is compiled with the
93 ** shared-cache feature disabled, then there is only ever one user
94 ** of each BtShared structure and so this locking is not necessary.
95 ** So define the lock related functions as no-ops.
97 #define queryTableLock(a,b,c) SQLITE_OK
98 #define lockTable(a,b,c) SQLITE_OK
99 #define unlockAllTables(a)
102 #ifndef SQLITE_OMIT_SHARED_CACHE
104 ** Query to see if btree handle p may obtain a lock of type eLock
105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
107 ** SQLITE_LOCKED if not.
109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
110 BtShared *pBt = p->pBt;
113 assert( sqlite3BtreeHoldsMutex(p) );
114 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
117 /* This is a no-op if the shared-cache is not enabled */
122 /* If some other connection is holding an exclusive lock, the
123 ** requested lock may not be obtained.
125 if( pBt->pExclusive && pBt->pExclusive!=p ){
126 return SQLITE_LOCKED;
129 /* This (along with lockTable()) is where the ReadUncommitted flag is
130 ** dealt with. If the caller is querying for a read-lock and the flag is
131 ** set, it is unconditionally granted - even if there are write-locks
132 ** on the table. If a write-lock is requested, the ReadUncommitted flag
133 ** is not considered.
135 ** In function lockTable(), if a read-lock is demanded and the
136 ** ReadUncommitted flag is set, no entry is added to the locks list
139 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
140 ** not create or respect table locks. The locking procedure for a
141 ** write-cursor does not change.
144 0==(p->db->flags&SQLITE_ReadUncommitted) ||
148 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
149 if( pIter->pBtree!=p && pIter->iTable==iTab &&
150 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
151 return SQLITE_LOCKED;
157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
159 #ifndef SQLITE_OMIT_SHARED_CACHE
161 ** Add a lock on the table with root-page iTable to the shared-btree used
162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
166 ** SQLITE_NOMEM may also be returned.
168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
169 BtShared *pBt = p->pBt;
173 assert( sqlite3BtreeHoldsMutex(p) );
174 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
177 /* This is a no-op if the shared-cache is not enabled */
182 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
184 /* If the read-uncommitted flag is set and a read-lock is requested,
185 ** return early without adding an entry to the BtShared.pLock list. See
186 ** comment in function queryTableLock() for more info on handling
187 ** the ReadUncommitted flag.
190 (p->db->flags&SQLITE_ReadUncommitted) &&
191 (eLock==READ_LOCK) &&
197 /* First search the list for an existing lock on this table. */
198 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
199 if( pIter->iTable==iTable && pIter->pBtree==p ){
205 /* If the above search did not find a BtLock struct associating Btree p
206 ** with table iTable, allocate one and link it into the list.
209 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
213 pLock->iTable = iTable;
215 pLock->pNext = pBt->pLock;
219 /* Set the BtLock.eLock variable to the maximum of the current lock
220 ** and the requested lock. This means if a write-lock was already held
221 ** and a read-lock requested, we don't incorrectly downgrade the lock.
223 assert( WRITE_LOCK>READ_LOCK );
224 if( eLock>pLock->eLock ){
225 pLock->eLock = eLock;
230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
232 #ifndef SQLITE_OMIT_SHARED_CACHE
234 ** Release all the table locks (locks obtained via calls to the lockTable()
235 ** procedure) held by Btree handle p.
237 static void unlockAllTables(Btree *p){
238 BtShared *pBt = p->pBt;
239 BtLock **ppIter = &pBt->pLock;
241 assert( sqlite3BtreeHoldsMutex(p) );
242 assert( p->sharable || 0==*ppIter );
245 BtLock *pLock = *ppIter;
246 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
247 if( pLock->pBtree==p ){
248 *ppIter = pLock->pNext;
251 ppIter = &pLock->pNext;
255 if( pBt->pExclusive==p ){
259 #endif /* SQLITE_OMIT_SHARED_CACHE */
261 static void releasePage(MemPage *pPage); /* Forward reference */
264 ** Verify that the cursor holds a mutex on the BtShared
267 static int cursorHoldsMutex(BtCursor *p){
268 return sqlite3_mutex_held(p->pBt->mutex);
273 #ifndef SQLITE_OMIT_INCRBLOB
275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
277 static void invalidateOverflowCache(BtCursor *pCur){
278 assert( cursorHoldsMutex(pCur) );
279 sqlite3_free(pCur->aOverflow);
284 ** Invalidate the overflow page-list cache for all cursors opened
285 ** on the shared btree structure pBt.
287 static void invalidateAllOverflowCache(BtShared *pBt){
289 assert( sqlite3_mutex_held(pBt->mutex) );
290 for(p=pBt->pCursor; p; p=p->pNext){
291 invalidateOverflowCache(p);
295 #define invalidateOverflowCache(x)
296 #define invalidateAllOverflowCache(x)
300 ** Save the current cursor position in the variables BtCursor.nKey
301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
303 static int saveCursorPosition(BtCursor *pCur){
306 assert( CURSOR_VALID==pCur->eState );
307 assert( 0==pCur->pKey );
308 assert( cursorHoldsMutex(pCur) );
310 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
312 /* If this is an intKey table, then the above call to BtreeKeySize()
313 ** stores the integer key in pCur->nKey. In this case this value is
314 ** all that is required. Otherwise, if pCur is not open on an intKey
315 ** table, then malloc space for and store the pCur->nKey bytes of key
318 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
319 void *pKey = sqlite3Malloc(pCur->nKey);
321 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
331 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
335 for(i=0; i<=pCur->iPage; i++){
336 releasePage(pCur->apPage[i]);
340 pCur->eState = CURSOR_REQUIRESEEK;
343 invalidateOverflowCache(pCur);
348 ** Save the positions of all cursors except pExcept open on the table
349 ** with root-page iRoot. Usually, this is called just before cursor
350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
354 assert( sqlite3_mutex_held(pBt->mutex) );
355 assert( pExcept==0 || pExcept->pBt==pBt );
356 for(p=pBt->pCursor; p; p=p->pNext){
357 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
358 p->eState==CURSOR_VALID ){
359 int rc = saveCursorPosition(p);
369 ** Clear the current cursor position.
371 static void clearCursorPosition(BtCursor *pCur){
372 assert( cursorHoldsMutex(pCur) );
373 sqlite3_free(pCur->pKey);
375 pCur->eState = CURSOR_INVALID;
379 ** Restore the cursor to the position it was in (or as close to as possible)
380 ** when saveCursorPosition() was called. Note that this call deletes the
381 ** saved position info stored by saveCursorPosition(), so there can be
382 ** at most one effective restoreCursorPosition() call after each
383 ** saveCursorPosition().
385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
387 assert( cursorHoldsMutex(pCur) );
388 assert( pCur->eState>=CURSOR_REQUIRESEEK );
389 if( pCur->eState==CURSOR_FAULT ){
392 pCur->eState = CURSOR_INVALID;
393 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
395 sqlite3_free(pCur->pKey);
397 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
402 #define restoreCursorPosition(p) \
403 (p->eState>=CURSOR_REQUIRESEEK ? \
404 sqlite3BtreeRestoreCursorPosition(p) : \
408 ** Determine whether or not a cursor has moved from the position it
409 ** was last placed at. Cursor can move when the row they are pointing
410 ** at is deleted out from under them.
412 ** This routine returns an error code if something goes wrong. The
413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
418 rc = restoreCursorPosition(pCur);
423 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
431 #ifndef SQLITE_OMIT_AUTOVACUUM
433 ** Given a page number of a regular database page, return the page
434 ** number for the pointer-map page that contains the entry for the
435 ** input page number.
437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
438 int nPagesPerMapPage, iPtrMap, ret;
439 assert( sqlite3_mutex_held(pBt->mutex) );
440 nPagesPerMapPage = (pBt->usableSize/5)+1;
441 iPtrMap = (pgno-2)/nPagesPerMapPage;
442 ret = (iPtrMap*nPagesPerMapPage) + 2;
443 if( ret==PENDING_BYTE_PAGE(pBt) ){
450 ** Write an entry into the pointer map.
452 ** This routine updates the pointer map entry for page number 'key'
453 ** so that it maps to type 'eType' and parent page number 'pgno'.
454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
457 DbPage *pDbPage; /* The pointer map page */
458 u8 *pPtrmap; /* The pointer map data */
459 Pgno iPtrmap; /* The pointer map page number */
460 int offset; /* Offset in pointer map page */
463 assert( sqlite3_mutex_held(pBt->mutex) );
464 /* The master-journal page number must never be used as a pointer map page */
465 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
467 assert( pBt->autoVacuum );
469 return SQLITE_CORRUPT_BKPT;
471 iPtrmap = PTRMAP_PAGENO(pBt, key);
472 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
476 offset = PTRMAP_PTROFFSET(iPtrmap, key);
477 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
479 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
480 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
481 rc = sqlite3PagerWrite(pDbPage);
483 pPtrmap[offset] = eType;
484 put4byte(&pPtrmap[offset+1], parent);
488 sqlite3PagerUnref(pDbPage);
493 ** Read an entry from the pointer map.
495 ** This routine retrieves the pointer map entry for page 'key', writing
496 ** the type and parent page number to *pEType and *pPgno respectively.
497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
500 DbPage *pDbPage; /* The pointer map page */
501 int iPtrmap; /* Pointer map page index */
502 u8 *pPtrmap; /* Pointer map page data */
503 int offset; /* Offset of entry in pointer map */
506 assert( sqlite3_mutex_held(pBt->mutex) );
508 iPtrmap = PTRMAP_PAGENO(pBt, key);
509 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
513 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
515 offset = PTRMAP_PTROFFSET(iPtrmap, key);
517 *pEType = pPtrmap[offset];
518 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
520 sqlite3PagerUnref(pDbPage);
521 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
526 #define ptrmapPut(w,x,y,z) SQLITE_OK
527 #define ptrmapGet(w,x,y,z) SQLITE_OK
528 #define ptrmapPutOvfl(y,z) SQLITE_OK
532 ** Given a btree page and a cell index (0 means the first cell on
533 ** the page, 1 means the second cell, and so forth) return a pointer
534 ** to the cell content.
536 ** This routine works only for pages that do not contain overflow cells.
538 #define findCell(P,I) \
539 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
542 ** This a more complex version of findCell() that works for
543 ** pages that do contain overflow cells. See insert
545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
547 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
548 for(i=pPage->nOverflow-1; i>=0; i--){
550 struct _OvflCell *pOvfl;
551 pOvfl = &pPage->aOvfl[i];
560 return findCell(pPage, iCell);
564 ** Parse a cell content block and fill in the CellInfo structure. There
565 ** are two versions of this function. sqlite3BtreeParseCell() takes a
566 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
567 ** takes a pointer to the body of the cell as its second argument.
569 ** Within this file, the parseCell() macro can be called instead of
570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
572 void sqlite3BtreeParseCellPtr(
573 MemPage *pPage, /* Page containing the cell */
574 u8 *pCell, /* Pointer to the cell text. */
575 CellInfo *pInfo /* Fill in this structure */
577 int n; /* Number bytes in cell content header */
578 u32 nPayload; /* Number of bytes of cell payload */
580 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
582 pInfo->pCell = pCell;
583 assert( pPage->leaf==0 || pPage->leaf==1 );
584 n = pPage->childPtrSize;
585 assert( n==4-4*pPage->leaf );
587 if( pPage->hasData ){
588 n += getVarint32(&pCell[n], nPayload);
592 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
593 pInfo->nData = nPayload;
596 n += getVarint32(&pCell[n], nPayload);
597 pInfo->nKey = nPayload;
599 pInfo->nPayload = nPayload;
601 if( likely(nPayload<=pPage->maxLocal) ){
602 /* This is the (easy) common case where the entire payload fits
603 ** on the local page. No overflow is required.
605 int nSize; /* Total size of cell content in bytes */
606 nSize = nPayload + n;
607 pInfo->nLocal = nPayload;
608 pInfo->iOverflow = 0;
609 if( (nSize & ~3)==0 ){
610 nSize = 4; /* Minimum cell size is 4 */
612 pInfo->nSize = nSize;
614 /* If the payload will not fit completely on the local page, we have
615 ** to decide how much to store locally and how much to spill onto
616 ** overflow pages. The strategy is to minimize the amount of unused
617 ** space on overflow pages while keeping the amount of local storage
618 ** in between minLocal and maxLocal.
620 ** Warning: changing the way overflow payload is distributed in any
621 ** way will result in an incompatible file format.
623 int minLocal; /* Minimum amount of payload held locally */
624 int maxLocal; /* Maximum amount of payload held locally */
625 int surplus; /* Overflow payload available for local storage */
627 minLocal = pPage->minLocal;
628 maxLocal = pPage->maxLocal;
629 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
630 if( surplus <= maxLocal ){
631 pInfo->nLocal = surplus;
633 pInfo->nLocal = minLocal;
635 pInfo->iOverflow = pInfo->nLocal + n;
636 pInfo->nSize = pInfo->iOverflow + 4;
639 #define parseCell(pPage, iCell, pInfo) \
640 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
641 void sqlite3BtreeParseCell(
642 MemPage *pPage, /* Page containing the cell */
643 int iCell, /* The cell index. First cell is 0 */
644 CellInfo *pInfo /* Fill in this structure */
646 parseCell(pPage, iCell, pInfo);
650 ** Compute the total number of bytes that a Cell needs in the cell
651 ** data area of the btree-page. The return number includes the cell
652 ** data header and the local payload, but not any overflow page or
653 ** the space used by the cell pointer.
656 static u16 cellSize(MemPage *pPage, int iCell){
658 sqlite3BtreeParseCell(pPage, iCell, &info);
662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
664 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
668 #ifndef SQLITE_OMIT_AUTOVACUUM
670 ** If the cell pCell, part of page pPage contains a pointer
671 ** to an overflow page, insert an entry into the pointer-map
672 ** for the overflow page.
674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
677 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
678 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
679 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
680 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
681 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
686 ** If the cell with index iCell on page pPage contains a pointer
687 ** to an overflow page, insert an entry into the pointer-map
688 ** for the overflow page.
690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
692 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
693 pCell = findOverflowCell(pPage, iCell);
694 return ptrmapPutOvflPtr(pPage, pCell);
700 ** Defragment the page given. All Cells are moved to the
701 ** end of the page and all free space is collected into one
702 ** big FreeBlk that occurs in between the header and cell
703 ** pointer array and the cell content area.
705 static void defragmentPage(MemPage *pPage){
706 int i; /* Loop counter */
707 int pc; /* Address of a i-th cell */
708 int addr; /* Offset of first byte after cell pointer array */
709 int hdr; /* Offset to the page header */
710 int size; /* Size of a cell */
711 int usableSize; /* Number of usable bytes on a page */
712 int cellOffset; /* Offset to the cell pointer array */
713 int cbrk; /* Offset to the cell content area */
714 int nCell; /* Number of cells on the page */
715 unsigned char *data; /* The page data */
716 unsigned char *temp; /* Temp area for cell content */
718 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
719 assert( pPage->pBt!=0 );
720 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
721 assert( pPage->nOverflow==0 );
722 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
723 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
725 hdr = pPage->hdrOffset;
726 cellOffset = pPage->cellOffset;
727 nCell = pPage->nCell;
728 assert( nCell==get2byte(&data[hdr+3]) );
729 usableSize = pPage->pBt->usableSize;
730 cbrk = get2byte(&data[hdr+5]);
731 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
733 for(i=0; i<nCell; i++){
734 u8 *pAddr; /* The i-th cell pointer */
735 pAddr = &data[cellOffset + i*2];
736 pc = get2byte(pAddr);
737 assert( pc<pPage->pBt->usableSize );
738 size = cellSizePtr(pPage, &temp[pc]);
740 memcpy(&data[cbrk], &temp[pc], size);
741 put2byte(pAddr, cbrk);
743 assert( cbrk>=cellOffset+2*nCell );
744 put2byte(&data[hdr+5], cbrk);
748 addr = cellOffset+2*nCell;
749 memset(&data[addr], 0, cbrk-addr);
753 ** Allocate nByte bytes of space on a page.
755 ** Return the index into pPage->aData[] of the first byte of
756 ** the new allocation. The caller guarantees that there is enough
757 ** space. This routine will never fail.
759 ** If the page contains nBytes of free space but does not contain
760 ** nBytes of contiguous free space, then this routine automatically
761 ** calls defragementPage() to consolidate all free space before
762 ** allocating the new chunk.
764 static int allocateSpace(MemPage *pPage, int nByte){
774 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
775 assert( pPage->pBt );
776 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
777 assert( nByte>=0 ); /* Minimum cell size is 4 */
778 assert( pPage->nFree>=nByte );
779 assert( pPage->nOverflow==0 );
780 pPage->nFree -= nByte;
781 hdr = pPage->hdrOffset;
785 /* Search the freelist looking for a slot big enough to satisfy the
788 while( (pc = get2byte(&data[addr]))>0 ){
789 size = get2byte(&data[pc+2]);
792 memcpy(&data[addr], &data[pc], 2);
793 data[hdr+7] = nFrag + size - nByte;
796 put2byte(&data[pc+2], size-nByte);
797 return pc + size - nByte;
804 /* Allocate memory from the gap in between the cell pointer array
805 ** and the cell content area.
807 top = get2byte(&data[hdr+5]);
808 nCell = get2byte(&data[hdr+3]);
809 cellOffset = pPage->cellOffset;
810 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
811 defragmentPage(pPage);
812 top = get2byte(&data[hdr+5]);
815 assert( cellOffset + 2*nCell <= top );
816 put2byte(&data[hdr+5], top);
821 ** Return a section of the pPage->aData to the freelist.
822 ** The first byte of the new free block is pPage->aDisk[start]
823 ** and the size of the block is "size" bytes.
825 ** Most of the effort here is involved in coalesing adjacent
826 ** free blocks into a single big free block.
828 static void freeSpace(MemPage *pPage, int start, int size){
829 int addr, pbegin, hdr;
830 unsigned char *data = pPage->aData;
832 assert( pPage->pBt!=0 );
833 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
834 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
835 assert( (start + size)<=pPage->pBt->usableSize );
836 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
837 assert( size>=0 ); /* Minimum cell size is 4 */
839 #ifdef SQLITE_SECURE_DELETE
840 /* Overwrite deleted information with zeros when the SECURE_DELETE
841 ** option is enabled at compile-time */
842 memset(&data[start], 0, size);
845 /* Add the space back into the linked list of freeblocks */
846 hdr = pPage->hdrOffset;
848 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
849 assert( pbegin<=pPage->pBt->usableSize-4 );
850 assert( pbegin>addr );
853 assert( pbegin<=pPage->pBt->usableSize-4 );
854 assert( pbegin>addr || pbegin==0 );
855 put2byte(&data[addr], start);
856 put2byte(&data[start], pbegin);
857 put2byte(&data[start+2], size);
858 pPage->nFree += size;
860 /* Coalesce adjacent free blocks */
861 addr = pPage->hdrOffset + 1;
862 while( (pbegin = get2byte(&data[addr]))>0 ){
864 assert( pbegin>addr );
865 assert( pbegin<=pPage->pBt->usableSize-4 );
866 pnext = get2byte(&data[pbegin]);
867 psize = get2byte(&data[pbegin+2]);
868 if( pbegin + psize + 3 >= pnext && pnext>0 ){
869 int frag = pnext - (pbegin+psize);
870 assert( frag<=data[pPage->hdrOffset+7] );
871 data[pPage->hdrOffset+7] -= frag;
872 put2byte(&data[pbegin], get2byte(&data[pnext]));
873 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
879 /* If the cell content area begins with a freeblock, remove it. */
880 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
882 pbegin = get2byte(&data[hdr+1]);
883 memcpy(&data[hdr+1], &data[pbegin], 2);
884 top = get2byte(&data[hdr+5]);
885 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
890 ** Decode the flags byte (the first byte of the header) for a page
891 ** and initialize fields of the MemPage structure accordingly.
893 ** Only the following combinations are supported. Anything different
894 ** indicates a corrupt database files:
897 ** PTF_ZERODATA | PTF_LEAF
898 ** PTF_LEAFDATA | PTF_INTKEY
899 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
901 static int decodeFlags(MemPage *pPage, int flagByte){
902 BtShared *pBt; /* A copy of pPage->pBt */
904 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
905 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
906 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
907 flagByte &= ~PTF_LEAF;
908 pPage->childPtrSize = 4-4*pPage->leaf;
910 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
912 pPage->hasData = pPage->leaf;
913 pPage->maxLocal = pBt->maxLeaf;
914 pPage->minLocal = pBt->minLeaf;
915 }else if( flagByte==PTF_ZERODATA ){
918 pPage->maxLocal = pBt->maxLocal;
919 pPage->minLocal = pBt->minLocal;
921 return SQLITE_CORRUPT_BKPT;
927 ** Initialize the auxiliary information for a disk block.
929 ** Return SQLITE_OK on success. If we see that the page does
930 ** not contain a well-formed database page, then return
931 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
932 ** guarantee that the page is well-formed. It only shows that
933 ** we failed to detect any corruption.
935 int sqlite3BtreeInitPage(MemPage *pPage){
937 assert( pPage->pBt!=0 );
938 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
939 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
940 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
941 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
943 if( !pPage->isInit ){
944 int pc; /* Address of a freeblock within pPage->aData[] */
945 int hdr; /* Offset to beginning of page header */
946 u8 *data; /* Equal to pPage->aData */
947 BtShared *pBt; /* The main btree structure */
948 int usableSize; /* Amount of usable space on each page */
949 int cellOffset; /* Offset from start of page to first cell pointer */
950 int nFree; /* Number of unused bytes on the page */
951 int top; /* First byte of the cell content area */
955 hdr = pPage->hdrOffset;
957 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
958 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
959 pPage->maskPage = pBt->pageSize - 1;
960 pPage->nOverflow = 0;
961 usableSize = pBt->usableSize;
962 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
963 top = get2byte(&data[hdr+5]);
964 pPage->nCell = get2byte(&data[hdr+3]);
965 if( pPage->nCell>MX_CELL(pBt) ){
966 /* To many cells for a single page. The page must be corrupt */
967 return SQLITE_CORRUPT_BKPT;
970 /* Compute the total free space on the page */
971 pc = get2byte(&data[hdr+1]);
972 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
975 if( pc>usableSize-4 ){
976 /* Free block is off the page */
977 return SQLITE_CORRUPT_BKPT;
979 next = get2byte(&data[pc]);
980 size = get2byte(&data[pc+2]);
981 if( next>0 && next<=pc+size+3 ){
982 /* Free blocks must be in accending order */
983 return SQLITE_CORRUPT_BKPT;
988 pPage->nFree = nFree;
989 if( nFree>=usableSize ){
990 /* Free space cannot exceed total page size */
991 return SQLITE_CORRUPT_BKPT;
995 /* Check that all the offsets in the cell offset array are within range.
997 ** Omitting this consistency check and using the pPage->maskPage mask
998 ** to prevent overrunning the page buffer in findCell() results in a
999 ** 2.5% performance gain.
1002 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1003 u8 *pEnd; /* Pointer to end of cell offset array */
1004 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1005 mask = ~(((u8)(pBt->pageSize>>8))-1);
1006 pEnd = &data[cellOffset + pPage->nCell*2];
1007 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1009 return SQLITE_CORRUPT_BKPT;
1020 ** Set up a raw page so that it looks like a database page holding
1023 static void zeroPage(MemPage *pPage, int flags){
1024 unsigned char *data = pPage->aData;
1025 BtShared *pBt = pPage->pBt;
1026 int hdr = pPage->hdrOffset;
1029 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1030 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1031 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1032 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1033 assert( sqlite3_mutex_held(pBt->mutex) );
1034 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1036 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1037 memset(&data[hdr+1], 0, 4);
1039 put2byte(&data[hdr+5], pBt->usableSize);
1040 pPage->nFree = pBt->usableSize - first;
1041 decodeFlags(pPage, flags);
1042 pPage->hdrOffset = hdr;
1043 pPage->cellOffset = first;
1044 pPage->nOverflow = 0;
1045 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1046 pPage->maskPage = pBt->pageSize - 1;
1053 ** Convert a DbPage obtained from the pager into a MemPage used by
1056 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1057 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1058 pPage->aData = sqlite3PagerGetData(pDbPage);
1059 pPage->pDbPage = pDbPage;
1062 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1067 ** Get a page from the pager. Initialize the MemPage.pBt and
1068 ** MemPage.aData elements if needed.
1070 ** If the noContent flag is set, it means that we do not care about
1071 ** the content of the page at this time. So do not go to the disk
1072 ** to fetch the content. Just fill in the content with zeros for now.
1073 ** If in the future we call sqlite3PagerWrite() on this page, that
1074 ** means we have started to be concerned about content and the disk
1075 ** read should occur at that point.
1077 int sqlite3BtreeGetPage(
1078 BtShared *pBt, /* The btree */
1079 Pgno pgno, /* Number of the page to fetch */
1080 MemPage **ppPage, /* Return the page in this parameter */
1081 int noContent /* Do not load page content if true */
1086 assert( sqlite3_mutex_held(pBt->mutex) );
1087 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1089 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1094 ** Return the size of the database file in pages. Or return -1 if
1095 ** there is any kind of error.
1097 static int pagerPagecount(Pager *pPager){
1100 rc = sqlite3PagerPagecount(pPager, &nPage);
1101 return (rc==SQLITE_OK?nPage:-1);
1105 ** Get a page from the pager and initialize it. This routine
1106 ** is just a convenience wrapper around separate calls to
1107 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1109 static int getAndInitPage(
1110 BtShared *pBt, /* The database file */
1111 Pgno pgno, /* Number of the page to get */
1112 MemPage **ppPage /* Write the page pointer here */
1118 assert( sqlite3_mutex_held(pBt->mutex) );
1120 return SQLITE_CORRUPT_BKPT;
1123 /* It is often the case that the page we want is already in cache.
1124 ** If so, get it directly. This saves us from having to call
1125 ** pagerPagecount() to make sure pgno is within limits, which results
1126 ** in a measureable performance improvements.
1128 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1130 /* Page is already in cache */
1131 *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1134 /* Page not in cache. Acquire it. */
1135 if( pgno>pagerPagecount(pBt->pPager) ){
1136 return SQLITE_CORRUPT_BKPT;
1138 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1142 if( !pPage->isInit ){
1143 rc = sqlite3BtreeInitPage(pPage);
1145 if( rc!=SQLITE_OK ){
1153 ** Release a MemPage. This should be called once for each prior
1154 ** call to sqlite3BtreeGetPage.
1156 static void releasePage(MemPage *pPage){
1158 assert( pPage->aData );
1159 assert( pPage->pBt );
1160 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1161 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1162 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1163 sqlite3PagerUnref(pPage->pDbPage);
1168 ** During a rollback, when the pager reloads information into the cache
1169 ** so that the cache is restored to its original state at the start of
1170 ** the transaction, for each page restored this routine is called.
1172 ** This routine needs to reset the extra data section at the end of the
1173 ** page to agree with the restored data.
1175 static void pageReinit(DbPage *pData){
1177 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1178 if( pPage->isInit ){
1179 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1181 if( sqlite3PagerPageRefcount(pData)>0 ){
1182 sqlite3BtreeInitPage(pPage);
1188 ** Invoke the busy handler for a btree.
1190 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1191 BtShared *pBt = (BtShared*)pArg;
1193 assert( sqlite3_mutex_held(pBt->db->mutex) );
1194 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1198 ** Open a database file.
1200 ** zFilename is the name of the database file. If zFilename is NULL
1201 ** a new database with a random name is created. This randomly named
1202 ** database file will be deleted when sqlite3BtreeClose() is called.
1203 ** If zFilename is ":memory:" then an in-memory database is created
1204 ** that is automatically destroyed when it is closed.
1206 int sqlite3BtreeOpen(
1207 const char *zFilename, /* Name of the file containing the BTree database */
1208 sqlite3 *db, /* Associated database handle */
1209 Btree **ppBtree, /* Pointer to new Btree object written here */
1210 int flags, /* Options */
1211 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1213 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1214 BtShared *pBt = 0; /* Shared part of btree structure */
1215 Btree *p; /* Handle to return */
1218 unsigned char zDbHeader[100];
1220 /* Set the variable isMemdb to true for an in-memory database, or
1221 ** false for a file-based database. This symbol is only required if
1222 ** either of the shared-data or autovacuum features are compiled
1223 ** into the library.
1225 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1226 #ifdef SQLITE_OMIT_MEMORYDB
1227 const int isMemdb = 0;
1229 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1234 assert( sqlite3_mutex_held(db->mutex) );
1237 p = sqlite3MallocZero(sizeof(Btree));
1239 return SQLITE_NOMEM;
1241 p->inTrans = TRANS_NONE;
1244 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1246 ** If this Btree is a candidate for shared cache, try to find an
1247 ** existing BtShared object that we can share with
1250 && (db->flags & SQLITE_Vtab)==0
1251 && zFilename && zFilename[0]
1253 if( sqlite3GlobalConfig.sharedCacheEnabled ){
1254 int nFullPathname = pVfs->mxPathname+1;
1255 char *zFullPathname = sqlite3Malloc(nFullPathname);
1256 sqlite3_mutex *mutexShared;
1258 db->flags |= SQLITE_SharedCache;
1259 if( !zFullPathname ){
1261 return SQLITE_NOMEM;
1263 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1264 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1265 sqlite3_mutex_enter(mutexShared);
1266 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1267 assert( pBt->nRef>0 );
1268 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1269 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1275 sqlite3_mutex_leave(mutexShared);
1276 sqlite3_free(zFullPathname);
1280 /* In debug mode, we mark all persistent databases as sharable
1281 ** even when they are not. This exercises the locking code and
1282 ** gives more opportunity for asserts(sqlite3_mutex_held())
1283 ** statements to find locking problems.
1292 ** The following asserts make sure that structures used by the btree are
1293 ** the right size. This is to guard against size changes that result
1294 ** when compiling on a different architecture.
1296 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1297 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1298 assert( sizeof(u32)==4 );
1299 assert( sizeof(u16)==2 );
1300 assert( sizeof(Pgno)==4 );
1302 pBt = sqlite3MallocZero( sizeof(*pBt) );
1305 goto btree_open_out;
1307 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1308 pBt->busyHdr.pArg = pBt;
1309 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1310 EXTRA_SIZE, flags, vfsFlags);
1311 if( rc==SQLITE_OK ){
1312 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1314 if( rc!=SQLITE_OK ){
1315 goto btree_open_out;
1317 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1320 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1323 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1324 pBt->pageSize = get2byte(&zDbHeader[16]);
1325 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1326 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1328 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1329 #ifndef SQLITE_OMIT_AUTOVACUUM
1330 /* If the magic name ":memory:" will create an in-memory database, then
1331 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1332 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1333 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1334 ** regular file-name. In this case the auto-vacuum applies as per normal.
1336 if( zFilename && !isMemdb ){
1337 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1338 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1343 nReserve = zDbHeader[20];
1344 pBt->pageSizeFixed = 1;
1345 #ifndef SQLITE_OMIT_AUTOVACUUM
1346 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1347 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1350 pBt->usableSize = pBt->pageSize - nReserve;
1351 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1352 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1354 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1355 /* Add the new BtShared object to the linked list sharable BtShareds.
1358 sqlite3_mutex *mutexShared;
1360 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1361 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1362 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1363 if( pBt->mutex==0 ){
1365 db->mallocFailed = 0;
1366 goto btree_open_out;
1369 sqlite3_mutex_enter(mutexShared);
1370 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1371 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1372 sqlite3_mutex_leave(mutexShared);
1377 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1378 /* If the new Btree uses a sharable pBtShared, then link the new
1379 ** Btree into the list of all sharable Btrees for the same connection.
1380 ** The list is kept in ascending order by pBt address.
1385 for(i=0; i<db->nDb; i++){
1386 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1387 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1388 if( p->pBt<pSib->pBt ){
1393 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1396 p->pNext = pSib->pNext;
1399 p->pNext->pPrev = p;
1411 if( rc!=SQLITE_OK ){
1412 if( pBt && pBt->pPager ){
1413 sqlite3PagerClose(pBt->pPager);
1423 ** Decrement the BtShared.nRef counter. When it reaches zero,
1424 ** remove the BtShared structure from the sharing list. Return
1425 ** true if the BtShared.nRef counter reaches zero and return
1426 ** false if it is still positive.
1428 static int removeFromSharingList(BtShared *pBt){
1429 #ifndef SQLITE_OMIT_SHARED_CACHE
1430 sqlite3_mutex *pMaster;
1434 assert( sqlite3_mutex_notheld(pBt->mutex) );
1435 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1436 sqlite3_mutex_enter(pMaster);
1439 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1440 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1442 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1443 while( ALWAYS(pList) && pList->pNext!=pBt ){
1446 if( ALWAYS(pList) ){
1447 pList->pNext = pBt->pNext;
1450 if( SQLITE_THREADSAFE ){
1451 sqlite3_mutex_free(pBt->mutex);
1455 sqlite3_mutex_leave(pMaster);
1463 ** Make sure pBt->pTmpSpace points to an allocation of
1464 ** MX_CELL_SIZE(pBt) bytes.
1466 static void allocateTempSpace(BtShared *pBt){
1467 if( !pBt->pTmpSpace ){
1468 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1473 ** Free the pBt->pTmpSpace allocation
1475 static void freeTempSpace(BtShared *pBt){
1476 sqlite3PageFree( pBt->pTmpSpace);
1481 ** Close an open database and invalidate all cursors.
1483 int sqlite3BtreeClose(Btree *p){
1484 BtShared *pBt = p->pBt;
1487 /* Close all cursors opened via this handle. */
1488 assert( sqlite3_mutex_held(p->db->mutex) );
1489 sqlite3BtreeEnter(p);
1491 pCur = pBt->pCursor;
1493 BtCursor *pTmp = pCur;
1495 if( pTmp->pBtree==p ){
1496 sqlite3BtreeCloseCursor(pTmp);
1500 /* Rollback any active transaction and free the handle structure.
1501 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1504 sqlite3BtreeRollback(p);
1505 sqlite3BtreeLeave(p);
1507 /* If there are still other outstanding references to the shared-btree
1508 ** structure, return now. The remainder of this procedure cleans
1509 ** up the shared-btree.
1511 assert( p->wantToLock==0 && p->locked==0 );
1512 if( !p->sharable || removeFromSharingList(pBt) ){
1513 /* The pBt is no longer on the sharing list, so we can access
1514 ** it without having to hold the mutex.
1516 ** Clean out and delete the BtShared object.
1518 assert( !pBt->pCursor );
1519 sqlite3PagerClose(pBt->pPager);
1520 if( pBt->xFreeSchema && pBt->pSchema ){
1521 pBt->xFreeSchema(pBt->pSchema);
1523 sqlite3_free(pBt->pSchema);
1528 #ifndef SQLITE_OMIT_SHARED_CACHE
1529 assert( p->wantToLock==0 );
1530 assert( p->locked==0 );
1531 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1532 if( p->pNext ) p->pNext->pPrev = p->pPrev;
1540 ** Change the limit on the number of pages allowed in the cache.
1542 ** The maximum number of cache pages is set to the absolute
1543 ** value of mxPage. If mxPage is negative, the pager will
1544 ** operate asynchronously - it will not stop to do fsync()s
1545 ** to insure data is written to the disk surface before
1546 ** continuing. Transactions still work if synchronous is off,
1547 ** and the database cannot be corrupted if this program
1548 ** crashes. But if the operating system crashes or there is
1549 ** an abrupt power failure when synchronous is off, the database
1550 ** could be left in an inconsistent and unrecoverable state.
1551 ** Synchronous is on by default so database corruption is not
1552 ** normally a worry.
1554 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1555 BtShared *pBt = p->pBt;
1556 assert( sqlite3_mutex_held(p->db->mutex) );
1557 sqlite3BtreeEnter(p);
1558 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1559 sqlite3BtreeLeave(p);
1564 ** Change the way data is synced to disk in order to increase or decrease
1565 ** how well the database resists damage due to OS crashes and power
1566 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
1567 ** there is a high probability of damage) Level 2 is the default. There
1568 ** is a very low but non-zero probability of damage. Level 3 reduces the
1569 ** probability of damage to near zero but with a write performance reduction.
1571 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1572 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1573 BtShared *pBt = p->pBt;
1574 assert( sqlite3_mutex_held(p->db->mutex) );
1575 sqlite3BtreeEnter(p);
1576 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1577 sqlite3BtreeLeave(p);
1583 ** Return TRUE if the given btree is set to safety level 1. In other
1584 ** words, return TRUE if no sync() occurs on the disk files.
1586 int sqlite3BtreeSyncDisabled(Btree *p){
1587 BtShared *pBt = p->pBt;
1589 assert( sqlite3_mutex_held(p->db->mutex) );
1590 sqlite3BtreeEnter(p);
1591 assert( pBt && pBt->pPager );
1592 rc = sqlite3PagerNosync(pBt->pPager);
1593 sqlite3BtreeLeave(p);
1597 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1599 ** Change the default pages size and the number of reserved bytes per page.
1601 ** The page size must be a power of 2 between 512 and 65536. If the page
1602 ** size supplied does not meet this constraint then the page size is not
1605 ** Page sizes are constrained to be a power of two so that the region
1606 ** of the database file used for locking (beginning at PENDING_BYTE,
1607 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1608 ** at the beginning of a page.
1610 ** If parameter nReserve is less than zero, then the number of reserved
1611 ** bytes per page is left unchanged.
1613 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1615 BtShared *pBt = p->pBt;
1616 sqlite3BtreeEnter(p);
1617 if( pBt->pageSizeFixed ){
1618 sqlite3BtreeLeave(p);
1619 return SQLITE_READONLY;
1622 nReserve = pBt->pageSize - pBt->usableSize;
1624 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1625 ((pageSize-1)&pageSize)==0 ){
1626 assert( (pageSize & 7)==0 );
1627 assert( !pBt->pPage1 && !pBt->pCursor );
1628 pBt->pageSize = pageSize;
1630 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1632 pBt->usableSize = pBt->pageSize - nReserve;
1633 sqlite3BtreeLeave(p);
1638 ** Return the currently defined page size
1640 int sqlite3BtreeGetPageSize(Btree *p){
1641 return p->pBt->pageSize;
1643 int sqlite3BtreeGetReserve(Btree *p){
1645 sqlite3BtreeEnter(p);
1646 n = p->pBt->pageSize - p->pBt->usableSize;
1647 sqlite3BtreeLeave(p);
1652 ** Set the maximum page count for a database if mxPage is positive.
1653 ** No changes are made if mxPage is 0 or negative.
1654 ** Regardless of the value of mxPage, return the maximum page count.
1656 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1658 sqlite3BtreeEnter(p);
1659 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1660 sqlite3BtreeLeave(p);
1663 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1666 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1667 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1668 ** is disabled. The default value for the auto-vacuum property is
1669 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1671 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1672 #ifdef SQLITE_OMIT_AUTOVACUUM
1673 return SQLITE_READONLY;
1675 BtShared *pBt = p->pBt;
1677 int av = (autoVacuum?1:0);
1679 sqlite3BtreeEnter(p);
1680 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1681 rc = SQLITE_READONLY;
1683 pBt->autoVacuum = av;
1685 sqlite3BtreeLeave(p);
1691 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1692 ** enabled 1 is returned. Otherwise 0.
1694 int sqlite3BtreeGetAutoVacuum(Btree *p){
1695 #ifdef SQLITE_OMIT_AUTOVACUUM
1696 return BTREE_AUTOVACUUM_NONE;
1699 sqlite3BtreeEnter(p);
1701 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1702 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1703 BTREE_AUTOVACUUM_INCR
1705 sqlite3BtreeLeave(p);
1712 ** Get a reference to pPage1 of the database file. This will
1713 ** also acquire a readlock on that file.
1715 ** SQLITE_OK is returned on success. If the file is not a
1716 ** well-formed database file, then SQLITE_CORRUPT is returned.
1717 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1718 ** is returned if we run out of memory.
1720 static int lockBtree(BtShared *pBt){
1725 assert( sqlite3_mutex_held(pBt->mutex) );
1726 if( pBt->pPage1 ) return SQLITE_OK;
1727 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1728 if( rc!=SQLITE_OK ) return rc;
1730 /* Do some checking to help insure the file we opened really is
1731 ** a valid database file.
1733 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1734 if( rc!=SQLITE_OK ){
1735 goto page1_init_failed;
1736 }else if( nPage>0 ){
1739 u8 *page1 = pPage1->aData;
1741 if( memcmp(page1, zMagicHeader, 16)!=0 ){
1742 goto page1_init_failed;
1748 goto page1_init_failed;
1751 /* The maximum embedded fraction must be exactly 25%. And the minimum
1752 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1753 ** The original design allowed these amounts to vary, but as of
1754 ** version 3.6.0, we require them to be fixed.
1756 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1757 goto page1_init_failed;
1759 pageSize = get2byte(&page1[16]);
1760 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1761 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1763 goto page1_init_failed;
1765 assert( (pageSize & 7)==0 );
1766 usableSize = pageSize - page1[20];
1767 if( pageSize!=pBt->pageSize ){
1768 /* After reading the first page of the database assuming a page size
1769 ** of BtShared.pageSize, we have discovered that the page-size is
1770 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1771 ** zero and return SQLITE_OK. The caller will call this function
1772 ** again with the correct page-size.
1774 releasePage(pPage1);
1775 pBt->usableSize = usableSize;
1776 pBt->pageSize = pageSize;
1778 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1781 if( usableSize<500 ){
1782 goto page1_init_failed;
1784 pBt->pageSize = pageSize;
1785 pBt->usableSize = usableSize;
1786 #ifndef SQLITE_OMIT_AUTOVACUUM
1787 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1788 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1792 /* maxLocal is the maximum amount of payload to store locally for
1793 ** a cell. Make sure it is small enough so that at least minFanout
1794 ** cells can will fit on one page. We assume a 10-byte page header.
1795 ** Besides the payload, the cell must store:
1796 ** 2-byte pointer to the cell
1797 ** 4-byte child pointer
1798 ** 9-byte nKey value
1799 ** 4-byte nData value
1800 ** 4-byte overflow page pointer
1801 ** So a cell consists of a 2-byte poiner, a header which is as much as
1802 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1805 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1806 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
1807 pBt->maxLeaf = pBt->usableSize - 35;
1808 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
1809 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1810 pBt->pPage1 = pPage1;
1814 releasePage(pPage1);
1820 ** This routine works like lockBtree() except that it also invokes the
1821 ** busy callback if there is lock contention.
1823 static int lockBtreeWithRetry(Btree *pRef){
1826 assert( sqlite3BtreeHoldsMutex(pRef) );
1827 if( pRef->inTrans==TRANS_NONE ){
1828 u8 inTransaction = pRef->pBt->inTransaction;
1829 btreeIntegrity(pRef);
1830 rc = sqlite3BtreeBeginTrans(pRef, 0);
1831 pRef->pBt->inTransaction = inTransaction;
1832 pRef->inTrans = TRANS_NONE;
1833 if( rc==SQLITE_OK ){
1834 pRef->pBt->nTransaction--;
1836 btreeIntegrity(pRef);
1843 ** If there are no outstanding cursors and we are not in the middle
1844 ** of a transaction but there is a read lock on the database, then
1845 ** this routine unrefs the first page of the database file which
1846 ** has the effect of releasing the read lock.
1848 ** If there are any outstanding cursors, this routine is a no-op.
1850 ** If there is a transaction in progress, this routine is a no-op.
1852 static void unlockBtreeIfUnused(BtShared *pBt){
1853 assert( sqlite3_mutex_held(pBt->mutex) );
1854 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1855 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1856 assert( pBt->pPage1->aData );
1858 if( pBt->pPage1->aData==0 ){
1859 MemPage *pPage = pBt->pPage1;
1860 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1865 releasePage(pBt->pPage1);
1873 ** Create a new database by initializing the first page of the
1876 static int newDatabase(BtShared *pBt){
1878 unsigned char *data;
1882 assert( sqlite3_mutex_held(pBt->mutex) );
1883 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1884 if( rc!=SQLITE_OK || nPage>0 ){
1890 rc = sqlite3PagerWrite(pP1->pDbPage);
1892 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1893 assert( sizeof(zMagicHeader)==16 );
1894 put2byte(&data[16], pBt->pageSize);
1897 data[20] = pBt->pageSize - pBt->usableSize;
1901 memset(&data[24], 0, 100-24);
1902 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1903 pBt->pageSizeFixed = 1;
1904 #ifndef SQLITE_OMIT_AUTOVACUUM
1905 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1906 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1907 put4byte(&data[36 + 4*4], pBt->autoVacuum);
1908 put4byte(&data[36 + 7*4], pBt->incrVacuum);
1914 ** Attempt to start a new transaction. A write-transaction
1915 ** is started if the second argument is nonzero, otherwise a read-
1916 ** transaction. If the second argument is 2 or more and exclusive
1917 ** transaction is started, meaning that no other process is allowed
1918 ** to access the database. A preexisting transaction may not be
1919 ** upgraded to exclusive by calling this routine a second time - the
1920 ** exclusivity flag only works for a new transaction.
1922 ** A write-transaction must be started before attempting any
1923 ** changes to the database. None of the following routines
1924 ** will work unless a transaction is started first:
1926 ** sqlite3BtreeCreateTable()
1927 ** sqlite3BtreeCreateIndex()
1928 ** sqlite3BtreeClearTable()
1929 ** sqlite3BtreeDropTable()
1930 ** sqlite3BtreeInsert()
1931 ** sqlite3BtreeDelete()
1932 ** sqlite3BtreeUpdateMeta()
1934 ** If an initial attempt to acquire the lock fails because of lock contention
1935 ** and the database was previously unlocked, then invoke the busy handler
1936 ** if there is one. But if there was previously a read-lock, do not
1937 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1938 ** returned when there is already a read-lock in order to avoid a deadlock.
1940 ** Suppose there are two processes A and B. A has a read lock and B has
1941 ** a reserved lock. B tries to promote to exclusive but is blocked because
1942 ** of A's read lock. A tries to promote to reserved but is blocked by B.
1943 ** One or the other of the two processes must give way or there can be
1944 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1945 ** when A already has a read lock, we encourage A to give up and let B
1948 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1949 BtShared *pBt = p->pBt;
1952 sqlite3BtreeEnter(p);
1956 /* If the btree is already in a write-transaction, or it
1957 ** is already in a read-transaction and a read-transaction
1958 ** is requested, this is a no-op.
1960 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1964 /* Write transactions are not possible on a read-only database */
1965 if( pBt->readOnly && wrflag ){
1966 rc = SQLITE_READONLY;
1970 /* If another database handle has already opened a write transaction
1971 ** on this shared-btree structure and a second write transaction is
1972 ** requested, return SQLITE_BUSY.
1974 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1979 #ifndef SQLITE_OMIT_SHARED_CACHE
1982 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1983 if( pIter->pBtree!=p ){
1992 if( pBt->pPage1==0 ){
1994 rc = lockBtree(pBt);
1995 }while( pBt->pPage1==0 && rc==SQLITE_OK );
1998 if( rc==SQLITE_OK && wrflag ){
1999 if( pBt->readOnly ){
2000 rc = SQLITE_READONLY;
2002 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2003 if( rc==SQLITE_OK ){
2004 rc = newDatabase(pBt);
2009 if( rc==SQLITE_OK ){
2010 if( wrflag ) pBt->inStmt = 0;
2012 unlockBtreeIfUnused(pBt);
2014 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2015 sqlite3BtreeInvokeBusyHandler(pBt, 0) );
2017 if( rc==SQLITE_OK ){
2018 if( p->inTrans==TRANS_NONE ){
2019 pBt->nTransaction++;
2021 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2022 if( p->inTrans>pBt->inTransaction ){
2023 pBt->inTransaction = p->inTrans;
2025 #ifndef SQLITE_OMIT_SHARED_CACHE
2027 assert( !pBt->pExclusive );
2028 pBt->pExclusive = p;
2036 sqlite3BtreeLeave(p);
2041 #ifndef SQLITE_OMIT_AUTOVACUUM
2044 ** Set the pointer-map entries for all children of page pPage. Also, if
2045 ** pPage contains cells that point to overflow pages, set the pointer
2046 ** map entries for the overflow pages as well.
2048 static int setChildPtrmaps(MemPage *pPage){
2049 int i; /* Counter variable */
2050 int nCell; /* Number of cells in page pPage */
2051 int rc; /* Return code */
2052 BtShared *pBt = pPage->pBt;
2053 int isInitOrig = pPage->isInit;
2054 Pgno pgno = pPage->pgno;
2056 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2057 rc = sqlite3BtreeInitPage(pPage);
2058 if( rc!=SQLITE_OK ){
2059 goto set_child_ptrmaps_out;
2061 nCell = pPage->nCell;
2063 for(i=0; i<nCell; i++){
2064 u8 *pCell = findCell(pPage, i);
2066 rc = ptrmapPutOvflPtr(pPage, pCell);
2067 if( rc!=SQLITE_OK ){
2068 goto set_child_ptrmaps_out;
2072 Pgno childPgno = get4byte(pCell);
2073 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2074 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
2079 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2080 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2083 set_child_ptrmaps_out:
2084 pPage->isInit = isInitOrig;
2089 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2090 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
2091 ** iTo. Parameter eType describes the type of pointer to be modified, as
2094 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2097 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2098 ** page pointed to by one of the cells on pPage.
2100 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2101 ** overflow page in the list.
2103 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2104 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2105 if( eType==PTRMAP_OVERFLOW2 ){
2106 /* The pointer is always the first 4 bytes of the page in this case. */
2107 if( get4byte(pPage->aData)!=iFrom ){
2108 return SQLITE_CORRUPT_BKPT;
2110 put4byte(pPage->aData, iTo);
2112 int isInitOrig = pPage->isInit;
2116 sqlite3BtreeInitPage(pPage);
2117 nCell = pPage->nCell;
2119 for(i=0; i<nCell; i++){
2120 u8 *pCell = findCell(pPage, i);
2121 if( eType==PTRMAP_OVERFLOW1 ){
2123 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
2124 if( info.iOverflow ){
2125 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2126 put4byte(&pCell[info.iOverflow], iTo);
2131 if( get4byte(pCell)==iFrom ){
2132 put4byte(pCell, iTo);
2139 if( eType!=PTRMAP_BTREE ||
2140 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2141 return SQLITE_CORRUPT_BKPT;
2143 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2146 pPage->isInit = isInitOrig;
2153 ** Move the open database page pDbPage to location iFreePage in the
2154 ** database. The pDbPage reference remains valid.
2156 static int relocatePage(
2157 BtShared *pBt, /* Btree */
2158 MemPage *pDbPage, /* Open page to move */
2159 u8 eType, /* Pointer map 'type' entry for pDbPage */
2160 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2161 Pgno iFreePage, /* The location to move pDbPage to */
2164 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2165 Pgno iDbPage = pDbPage->pgno;
2166 Pager *pPager = pBt->pPager;
2169 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2170 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2171 assert( sqlite3_mutex_held(pBt->mutex) );
2172 assert( pDbPage->pBt==pBt );
2174 /* Move page iDbPage from its current location to page number iFreePage */
2175 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2176 iDbPage, iFreePage, iPtrPage, eType));
2177 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2178 if( rc!=SQLITE_OK ){
2181 pDbPage->pgno = iFreePage;
2183 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2184 ** that point to overflow pages. The pointer map entries for all these
2185 ** pages need to be changed.
2187 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2188 ** pointer to a subsequent overflow page. If this is the case, then
2189 ** the pointer map needs to be updated for the subsequent overflow page.
2191 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2192 rc = setChildPtrmaps(pDbPage);
2193 if( rc!=SQLITE_OK ){
2197 Pgno nextOvfl = get4byte(pDbPage->aData);
2199 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2200 if( rc!=SQLITE_OK ){
2206 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2207 ** that it points at iFreePage. Also fix the pointer map entry for
2210 if( eType!=PTRMAP_ROOTPAGE ){
2211 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2212 if( rc!=SQLITE_OK ){
2215 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2216 if( rc!=SQLITE_OK ){
2217 releasePage(pPtrPage);
2220 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2221 releasePage(pPtrPage);
2222 if( rc==SQLITE_OK ){
2223 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2229 /* Forward declaration required by incrVacuumStep(). */
2230 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2233 ** Perform a single step of an incremental-vacuum. If successful,
2234 ** return SQLITE_OK. If there is no work to do (and therefore no
2235 ** point in calling this function again), return SQLITE_DONE.
2237 ** More specificly, this function attempts to re-organize the
2238 ** database so that the last page of the file currently in use
2239 ** is no longer in use.
2241 ** If the nFin parameter is non-zero, the implementation assumes
2242 ** that the caller will keep calling incrVacuumStep() until
2243 ** it returns SQLITE_DONE or an error, and that nFin is the
2244 ** number of pages the database file will contain after this
2245 ** process is complete.
2247 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2248 Pgno iLastPg; /* Last page in the database */
2249 Pgno nFreeList; /* Number of pages still on the free-list */
2251 assert( sqlite3_mutex_held(pBt->mutex) );
2252 iLastPg = pBt->nTrunc;
2254 iLastPg = pagerPagecount(pBt->pPager);
2257 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2262 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2263 if( nFreeList==0 || nFin==iLastPg ){
2267 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2268 if( rc!=SQLITE_OK ){
2271 if( eType==PTRMAP_ROOTPAGE ){
2272 return SQLITE_CORRUPT_BKPT;
2275 if( eType==PTRMAP_FREEPAGE ){
2277 /* Remove the page from the files free-list. This is not required
2278 ** if nFin is non-zero. In that case, the free-list will be
2279 ** truncated to zero after this function returns, so it doesn't
2280 ** matter if it still contains some garbage entries.
2284 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2285 if( rc!=SQLITE_OK ){
2288 assert( iFreePg==iLastPg );
2289 releasePage(pFreePg);
2292 Pgno iFreePg; /* Index of free page to move pLastPg to */
2295 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2296 if( rc!=SQLITE_OK ){
2300 /* If nFin is zero, this loop runs exactly once and page pLastPg
2301 ** is swapped with the first free page pulled off the free list.
2303 ** On the other hand, if nFin is greater than zero, then keep
2304 ** looping until a free-page located within the first nFin pages
2305 ** of the file is found.
2309 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2310 if( rc!=SQLITE_OK ){
2311 releasePage(pLastPg);
2314 releasePage(pFreePg);
2315 }while( nFin!=0 && iFreePg>nFin );
2316 assert( iFreePg<iLastPg );
2318 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2319 if( rc==SQLITE_OK ){
2320 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2322 releasePage(pLastPg);
2323 if( rc!=SQLITE_OK ){
2329 pBt->nTrunc = iLastPg - 1;
2330 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2337 ** A write-transaction must be opened before calling this function.
2338 ** It performs a single unit of work towards an incremental vacuum.
2340 ** If the incremental vacuum is finished after this function has run,
2341 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
2342 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2344 int sqlite3BtreeIncrVacuum(Btree *p){
2346 BtShared *pBt = p->pBt;
2348 sqlite3BtreeEnter(p);
2350 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2351 if( !pBt->autoVacuum ){
2354 invalidateAllOverflowCache(pBt);
2355 rc = incrVacuumStep(pBt, 0);
2357 sqlite3BtreeLeave(p);
2362 ** This routine is called prior to sqlite3PagerCommit when a transaction
2363 ** is commited for an auto-vacuum database.
2365 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2366 ** the database file should be truncated to during the commit process.
2367 ** i.e. the database has been reorganized so that only the first *pnTrunc
2368 ** pages are in use.
2370 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2372 Pager *pPager = pBt->pPager;
2373 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
2375 assert( sqlite3_mutex_held(pBt->mutex) );
2376 invalidateAllOverflowCache(pBt);
2377 assert(pBt->autoVacuum);
2378 if( !pBt->incrVacuum ){
2381 if( pBt->nTrunc==0 ){
2384 const int pgsz = pBt->pageSize;
2385 int nOrig = pagerPagecount(pBt->pPager);
2387 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2388 return SQLITE_CORRUPT_BKPT;
2390 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2393 nFree = get4byte(&pBt->pPage1->aData[36]);
2394 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2395 nFin = nOrig - nFree - nPtrmap;
2396 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2399 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2404 while( rc==SQLITE_OK ){
2405 rc = incrVacuumStep(pBt, nFin);
2407 if( rc==SQLITE_DONE ){
2408 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2410 if( pBt->nTrunc && nFin ){
2411 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2412 put4byte(&pBt->pPage1->aData[32], 0);
2413 put4byte(&pBt->pPage1->aData[36], 0);
2417 if( rc!=SQLITE_OK ){
2418 sqlite3PagerRollback(pPager);
2422 if( rc==SQLITE_OK ){
2423 *pnTrunc = pBt->nTrunc;
2426 assert( nRef==sqlite3PagerRefcount(pPager) );
2433 ** This routine does the first phase of a two-phase commit. This routine
2434 ** causes a rollback journal to be created (if it does not already exist)
2435 ** and populated with enough information so that if a power loss occurs
2436 ** the database can be restored to its original state by playing back
2437 ** the journal. Then the contents of the journal are flushed out to
2438 ** the disk. After the journal is safely on oxide, the changes to the
2439 ** database are written into the database file and flushed to oxide.
2440 ** At the end of this call, the rollback journal still exists on the
2441 ** disk and we are still holding all locks, so the transaction has not
2442 ** committed. See sqlite3BtreeCommit() for the second phase of the
2445 ** This call is a no-op if no write-transaction is currently active on pBt.
2447 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2448 ** the name of a master journal file that should be written into the
2449 ** individual journal file, or is NULL, indicating no master journal file
2450 ** (single database transaction).
2452 ** When this is called, the master journal should already have been
2453 ** created, populated with this journal pointer and synced to disk.
2455 ** Once this is routine has returned, the only thing required to commit
2456 ** the write-transaction for this database file is to delete the journal.
2458 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2460 if( p->inTrans==TRANS_WRITE ){
2461 BtShared *pBt = p->pBt;
2463 sqlite3BtreeEnter(p);
2465 #ifndef SQLITE_OMIT_AUTOVACUUM
2466 if( pBt->autoVacuum ){
2467 rc = autoVacuumCommit(pBt, &nTrunc);
2468 if( rc!=SQLITE_OK ){
2469 sqlite3BtreeLeave(p);
2474 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
2475 sqlite3BtreeLeave(p);
2481 ** Commit the transaction currently in progress.
2483 ** This routine implements the second phase of a 2-phase commit. The
2484 ** sqlite3BtreeSync() routine does the first phase and should be invoked
2485 ** prior to calling this routine. The sqlite3BtreeSync() routine did
2486 ** all the work of writing information out to disk and flushing the
2487 ** contents so that they are written onto the disk platter. All this
2488 ** routine has to do is delete or truncate the rollback journal
2489 ** (which causes the transaction to commit) and drop locks.
2491 ** This will release the write lock on the database file. If there
2492 ** are no active cursors, it also releases the read lock.
2494 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2495 BtShared *pBt = p->pBt;
2497 sqlite3BtreeEnter(p);
2501 /* If the handle has a write-transaction open, commit the shared-btrees
2502 ** transaction and set the shared state to TRANS_READ.
2504 if( p->inTrans==TRANS_WRITE ){
2506 assert( pBt->inTransaction==TRANS_WRITE );
2507 assert( pBt->nTransaction>0 );
2508 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2509 if( rc!=SQLITE_OK ){
2510 sqlite3BtreeLeave(p);
2513 pBt->inTransaction = TRANS_READ;
2518 /* If the handle has any kind of transaction open, decrement the transaction
2519 ** count of the shared btree. If the transaction count reaches 0, set
2520 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2521 ** will unlock the pager.
2523 if( p->inTrans!=TRANS_NONE ){
2524 pBt->nTransaction--;
2525 if( 0==pBt->nTransaction ){
2526 pBt->inTransaction = TRANS_NONE;
2530 /* Set the handles current transaction state to TRANS_NONE and unlock
2531 ** the pager if this call closed the only read or write transaction.
2533 p->inTrans = TRANS_NONE;
2534 unlockBtreeIfUnused(pBt);
2537 sqlite3BtreeLeave(p);
2542 ** Do both phases of a commit.
2544 int sqlite3BtreeCommit(Btree *p){
2546 sqlite3BtreeEnter(p);
2547 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2548 if( rc==SQLITE_OK ){
2549 rc = sqlite3BtreeCommitPhaseTwo(p);
2551 sqlite3BtreeLeave(p);
2557 ** Return the number of write-cursors open on this handle. This is for use
2558 ** in assert() expressions, so it is only compiled if NDEBUG is not
2561 ** For the purposes of this routine, a write-cursor is any cursor that
2562 ** is capable of writing to the databse. That means the cursor was
2563 ** originally opened for writing and the cursor has not be disabled
2564 ** by having its state changed to CURSOR_FAULT.
2566 static int countWriteCursors(BtShared *pBt){
2569 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2570 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2577 ** This routine sets the state to CURSOR_FAULT and the error
2578 ** code to errCode for every cursor on BtShared that pBtree
2581 ** Every cursor is tripped, including cursors that belong
2582 ** to other database connections that happen to be sharing
2583 ** the cache with pBtree.
2585 ** This routine gets called when a rollback occurs.
2586 ** All cursors using the same cache must be tripped
2587 ** to prevent them from trying to use the btree after
2588 ** the rollback. The rollback may have deleted tables
2589 ** or moved root pages, so it is not sufficient to
2590 ** save the state of the cursor. The cursor must be
2593 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2595 sqlite3BtreeEnter(pBtree);
2596 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2597 clearCursorPosition(p);
2598 p->eState = CURSOR_FAULT;
2601 sqlite3BtreeLeave(pBtree);
2605 ** Rollback the transaction in progress. All cursors will be
2606 ** invalided by this operation. Any attempt to use a cursor
2607 ** that was open at the beginning of this operation will result
2610 ** This will release the write lock on the database file. If there
2611 ** are no active cursors, it also releases the read lock.
2613 int sqlite3BtreeRollback(Btree *p){
2615 BtShared *pBt = p->pBt;
2618 sqlite3BtreeEnter(p);
2620 rc = saveAllCursors(pBt, 0, 0);
2621 #ifndef SQLITE_OMIT_SHARED_CACHE
2622 if( rc!=SQLITE_OK ){
2623 /* This is a horrible situation. An IO or malloc() error occured whilst
2624 ** trying to save cursor positions. If this is an automatic rollback (as
2625 ** the result of a constraint, malloc() failure or IO error) then
2626 ** the cache may be internally inconsistent (not contain valid trees) so
2627 ** we cannot simply return the error to the caller. Instead, abort
2628 ** all queries that may be using any of the cursors that failed to save.
2630 sqlite3BtreeTripAllCursors(p, rc);
2636 if( p->inTrans==TRANS_WRITE ){
2639 #ifndef SQLITE_OMIT_AUTOVACUUM
2643 assert( TRANS_WRITE==pBt->inTransaction );
2644 rc2 = sqlite3PagerRollback(pBt->pPager);
2645 if( rc2!=SQLITE_OK ){
2649 /* The rollback may have destroyed the pPage1->aData value. So
2650 ** call sqlite3BtreeGetPage() on page 1 again to make
2651 ** sure pPage1->aData is set correctly. */
2652 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2653 releasePage(pPage1);
2655 assert( countWriteCursors(pBt)==0 );
2656 pBt->inTransaction = TRANS_READ;
2659 if( p->inTrans!=TRANS_NONE ){
2660 assert( pBt->nTransaction>0 );
2661 pBt->nTransaction--;
2662 if( 0==pBt->nTransaction ){
2663 pBt->inTransaction = TRANS_NONE;
2667 p->inTrans = TRANS_NONE;
2669 unlockBtreeIfUnused(pBt);
2672 sqlite3BtreeLeave(p);
2677 ** Start a statement subtransaction. The subtransaction can
2678 ** can be rolled back independently of the main transaction.
2679 ** You must start a transaction before starting a subtransaction.
2680 ** The subtransaction is ended automatically if the main transaction
2681 ** commits or rolls back.
2683 ** Only one subtransaction may be active at a time. It is an error to try
2684 ** to start a new subtransaction if another subtransaction is already active.
2686 ** Statement subtransactions are used around individual SQL statements
2687 ** that are contained within a BEGIN...COMMIT block. If a constraint
2688 ** error occurs within the statement, the effect of that one statement
2689 ** can be rolled back without having to rollback the entire transaction.
2691 int sqlite3BtreeBeginStmt(Btree *p){
2693 BtShared *pBt = p->pBt;
2694 sqlite3BtreeEnter(p);
2696 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2697 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2699 assert( pBt->inTransaction==TRANS_WRITE );
2700 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2703 sqlite3BtreeLeave(p);
2709 ** Commit the statment subtransaction currently in progress. If no
2710 ** subtransaction is active, this is a no-op.
2712 int sqlite3BtreeCommitStmt(Btree *p){
2714 BtShared *pBt = p->pBt;
2715 sqlite3BtreeEnter(p);
2717 if( pBt->inStmt && !pBt->readOnly ){
2718 rc = sqlite3PagerStmtCommit(pBt->pPager);
2723 sqlite3BtreeLeave(p);
2728 ** Rollback the active statement subtransaction. If no subtransaction
2729 ** is active this routine is a no-op.
2731 ** All cursors will be invalidated by this operation. Any attempt
2732 ** to use a cursor that was open at the beginning of this operation
2733 ** will result in an error.
2735 int sqlite3BtreeRollbackStmt(Btree *p){
2737 BtShared *pBt = p->pBt;
2738 sqlite3BtreeEnter(p);
2740 if( pBt->inStmt && !pBt->readOnly ){
2741 rc = sqlite3PagerStmtRollback(pBt->pPager);
2744 sqlite3BtreeLeave(p);
2749 ** Create a new cursor for the BTree whose root is on the page
2750 ** iTable. The act of acquiring a cursor gets a read lock on
2751 ** the database file.
2753 ** If wrFlag==0, then the cursor can only be used for reading.
2754 ** If wrFlag==1, then the cursor can be used for reading or for
2755 ** writing if other conditions for writing are also met. These
2756 ** are the conditions that must be met in order for writing to
2759 ** 1: The cursor must have been opened with wrFlag==1
2761 ** 2: Other database connections that share the same pager cache
2762 ** but which are not in the READ_UNCOMMITTED state may not have
2763 ** cursors open with wrFlag==0 on the same table. Otherwise
2764 ** the changes made by this write cursor would be visible to
2765 ** the read cursors in the other database connection.
2767 ** 3: The database must be writable (not on read-only media)
2769 ** 4: There must be an active transaction.
2771 ** No checking is done to make sure that page iTable really is the
2772 ** root page of a b-tree. If it is not, then the cursor acquired
2773 ** will not work correctly.
2775 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
2776 ** pointed to by pCur have been zeroed by the caller.
2778 static int btreeCursor(
2779 Btree *p, /* The btree */
2780 int iTable, /* Root page of table to open */
2781 int wrFlag, /* 1 to write. 0 read-only */
2782 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2783 BtCursor *pCur /* Space for new cursor */
2786 BtShared *pBt = p->pBt;
2788 assert( sqlite3BtreeHoldsMutex(p) );
2790 if( pBt->readOnly ){
2791 return SQLITE_READONLY;
2793 if( checkReadLocks(p, iTable, 0, 0) ){
2794 return SQLITE_LOCKED;
2798 if( pBt->pPage1==0 ){
2799 rc = lockBtreeWithRetry(p);
2800 if( rc!=SQLITE_OK ){
2803 if( pBt->readOnly && wrFlag ){
2804 return SQLITE_READONLY;
2807 pCur->pgnoRoot = (Pgno)iTable;
2808 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
2810 goto create_cursor_exception;
2812 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
2813 if( rc!=SQLITE_OK ){
2814 goto create_cursor_exception;
2817 /* Now that no other errors can occur, finish filling in the BtCursor
2818 ** variables, link the cursor into the BtShared list and set *ppCur (the
2819 ** output argument to this function).
2821 pCur->pKeyInfo = pKeyInfo;
2824 pCur->wrFlag = wrFlag;
2825 pCur->pNext = pBt->pCursor;
2827 pCur->pNext->pPrev = pCur;
2829 pBt->pCursor = pCur;
2830 pCur->eState = CURSOR_INVALID;
2834 create_cursor_exception:
2835 releasePage(pCur->apPage[0]);
2836 unlockBtreeIfUnused(pBt);
2839 int sqlite3BtreeCursor(
2840 Btree *p, /* The btree */
2841 int iTable, /* Root page of table to open */
2842 int wrFlag, /* 1 to write. 0 read-only */
2843 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2844 BtCursor *pCur /* Write new cursor here */
2847 sqlite3BtreeEnter(p);
2849 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
2850 sqlite3BtreeLeave(p);
2853 int sqlite3BtreeCursorSize(){
2854 return sizeof(BtCursor);
2860 ** Close a cursor. The read lock on the database file is released
2861 ** when the last cursor is closed.
2863 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2864 Btree *pBtree = pCur->pBtree;
2867 BtShared *pBt = pCur->pBt;
2868 sqlite3BtreeEnter(pBtree);
2869 pBt->db = pBtree->db;
2870 clearCursorPosition(pCur);
2872 pCur->pPrev->pNext = pCur->pNext;
2874 pBt->pCursor = pCur->pNext;
2877 pCur->pNext->pPrev = pCur->pPrev;
2879 for(i=0; i<=pCur->iPage; i++){
2880 releasePage(pCur->apPage[i]);
2882 unlockBtreeIfUnused(pBt);
2883 invalidateOverflowCache(pCur);
2884 /* sqlite3_free(pCur); */
2885 sqlite3BtreeLeave(pBtree);
2891 ** Make a temporary cursor by filling in the fields of pTempCur.
2892 ** The temporary cursor is not on the cursor list for the Btree.
2894 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2896 assert( cursorHoldsMutex(pCur) );
2897 memcpy(pTempCur, pCur, sizeof(BtCursor));
2898 pTempCur->pNext = 0;
2899 pTempCur->pPrev = 0;
2900 for(i=0; i<=pTempCur->iPage; i++){
2901 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
2903 assert( pCur->pKey==0 );
2907 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2910 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2912 assert( cursorHoldsMutex(pCur) );
2913 for(i=0; i<=pCur->iPage; i++){
2914 sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
2916 sqlite3_free(pCur->pKey);
2920 ** Make sure the BtCursor* given in the argument has a valid
2921 ** BtCursor.info structure. If it is not already valid, call
2922 ** sqlite3BtreeParseCell() to fill it in.
2924 ** BtCursor.info is a cache of the information in the current cell.
2925 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2927 ** 2007-06-25: There is a bug in some versions of MSVC that cause the
2928 ** compiler to crash when getCellInfo() is implemented as a macro.
2929 ** But there is a measureable speed advantage to using the macro on gcc
2930 ** (when less compiler optimizations like -Os or -O0 are used and the
2931 ** compiler is not doing agressive inlining.) So we use a real function
2932 ** for MSVC and a macro for everything else. Ticket #2457.
2935 static void assertCellInfo(BtCursor *pCur){
2937 int iPage = pCur->iPage;
2938 memset(&info, 0, sizeof(info));
2939 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
2940 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2943 #define assertCellInfo(x)
2946 /* Use a real function in MSVC to work around bugs in that compiler. */
2947 static void getCellInfo(BtCursor *pCur){
2948 if( pCur->info.nSize==0 ){
2949 int iPage = pCur->iPage;
2950 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
2951 pCur->validNKey = 1;
2953 assertCellInfo(pCur);
2956 #else /* if not _MSC_VER */
2957 /* Use a macro in all other compilers so that the function is inlined */
2958 #define getCellInfo(pCur) \
2959 if( pCur->info.nSize==0 ){ \
2960 int iPage = pCur->iPage; \
2961 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
2962 pCur->validNKey = 1; \
2964 assertCellInfo(pCur); \
2966 #endif /* _MSC_VER */
2969 ** Set *pSize to the size of the buffer needed to hold the value of
2970 ** the key for the current entry. If the cursor is not pointing
2971 ** to a valid entry, *pSize is set to 0.
2973 ** For a table with the INTKEY flag set, this routine returns the key
2974 ** itself, not the number of bytes in the key.
2976 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2979 assert( cursorHoldsMutex(pCur) );
2980 rc = restoreCursorPosition(pCur);
2981 if( rc==SQLITE_OK ){
2982 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2983 if( pCur->eState==CURSOR_INVALID ){
2987 *pSize = pCur->info.nKey;
2994 ** Set *pSize to the number of bytes of data in the entry the
2995 ** cursor currently points to. Always return SQLITE_OK.
2996 ** Failure is not possible. If the cursor is not currently
2997 ** pointing to an entry (which can happen, for example, if
2998 ** the database is empty) then *pSize is set to 0.
3000 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3003 assert( cursorHoldsMutex(pCur) );
3004 rc = restoreCursorPosition(pCur);
3005 if( rc==SQLITE_OK ){
3006 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3007 if( pCur->eState==CURSOR_INVALID ){
3008 /* Not pointing at a valid entry - set *pSize to 0. */
3012 *pSize = pCur->info.nData;
3019 ** Given the page number of an overflow page in the database (parameter
3020 ** ovfl), this function finds the page number of the next page in the
3021 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3022 ** pointer-map data instead of reading the content of page ovfl to do so.
3024 ** If an error occurs an SQLite error code is returned. Otherwise:
3026 ** Unless pPgnoNext is NULL, the page number of the next overflow
3027 ** page in the linked list is written to *pPgnoNext. If page ovfl
3028 ** is the last page in its linked list, *pPgnoNext is set to zero.
3030 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3031 ** for page ovfl. The underlying pager page may have been requested
3032 ** with the noContent flag set, so the page data accessable via
3033 ** this handle may not be trusted.
3035 static int getOverflowPage(
3037 Pgno ovfl, /* Overflow page */
3038 MemPage **ppPage, /* OUT: MemPage handle */
3039 Pgno *pPgnoNext /* OUT: Next overflow page number */
3044 assert( sqlite3_mutex_held(pBt->mutex) );
3045 /* One of these must not be NULL. Otherwise, why call this function? */
3046 assert(ppPage || pPgnoNext);
3048 /* If pPgnoNext is NULL, then this function is being called to obtain
3049 ** a MemPage* reference only. No page-data is required in this case.
3052 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
3055 #ifndef SQLITE_OMIT_AUTOVACUUM
3056 /* Try to find the next page in the overflow list using the
3057 ** autovacuum pointer-map pages. Guess that the next page in
3058 ** the overflow list is page number (ovfl+1). If that guess turns
3059 ** out to be wrong, fall back to loading the data of page
3060 ** number ovfl to determine the next page number.
3062 if( pBt->autoVacuum ){
3064 Pgno iGuess = ovfl+1;
3067 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3071 if( iGuess<=pagerPagecount(pBt->pPager) ){
3072 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3073 if( rc!=SQLITE_OK ){
3076 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3083 if( next==0 || ppPage ){
3086 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
3087 assert(rc==SQLITE_OK || pPage==0);
3088 if( next==0 && rc==SQLITE_OK ){
3089 next = get4byte(pPage->aData);
3104 ** Copy data from a buffer to a page, or from a page to a buffer.
3106 ** pPayload is a pointer to data stored on database page pDbPage.
3107 ** If argument eOp is false, then nByte bytes of data are copied
3108 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3109 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3110 ** of data are copied from the buffer pBuf to pPayload.
3112 ** SQLITE_OK is returned on success, otherwise an error code.
3114 static int copyPayload(
3115 void *pPayload, /* Pointer to page data */
3116 void *pBuf, /* Pointer to buffer */
3117 int nByte, /* Number of bytes to copy */
3118 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3119 DbPage *pDbPage /* Page containing pPayload */
3122 /* Copy data from buffer to page (a write operation) */
3123 int rc = sqlite3PagerWrite(pDbPage);
3124 if( rc!=SQLITE_OK ){
3127 memcpy(pPayload, pBuf, nByte);
3129 /* Copy data from page to buffer (a read operation) */
3130 memcpy(pBuf, pPayload, nByte);
3136 ** This function is used to read or overwrite payload information
3137 ** for the entry that the pCur cursor is pointing to. If the eOp
3138 ** parameter is 0, this is a read operation (data copied into
3139 ** buffer pBuf). If it is non-zero, a write (data copied from
3142 ** A total of "amt" bytes are read or written beginning at "offset".
3143 ** Data is read to or from the buffer pBuf.
3145 ** This routine does not make a distinction between key and data.
3146 ** It just reads or writes bytes from the payload area. Data might
3147 ** appear on the main page or be scattered out on multiple overflow
3150 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3151 ** cursor entry uses one or more overflow pages, this function
3152 ** allocates space for and lazily popluates the overflow page-list
3153 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3154 ** cache to make seeking to the supplied offset more efficient.
3156 ** Once an overflow page-list cache has been allocated, it may be
3157 ** invalidated if some other cursor writes to the same table, or if
3158 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3159 ** mode, the following events may invalidate an overflow page-list cache.
3161 ** * An incremental vacuum,
3162 ** * A commit in auto_vacuum="full" mode,
3163 ** * Creating a table (may require moving an overflow page).
3165 static int accessPayload(
3166 BtCursor *pCur, /* Cursor pointing to entry to read from */
3167 int offset, /* Begin reading this far into payload */
3168 int amt, /* Read this many bytes */
3169 unsigned char *pBuf, /* Write the bytes into this buffer */
3170 int skipKey, /* offset begins at data if this is true */
3171 int eOp /* zero to read. non-zero to write. */
3173 unsigned char *aPayload;
3177 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3178 BtShared *pBt; /* Btree this cursor belongs to */
3181 assert( pCur->eState==CURSOR_VALID );
3182 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3183 assert( offset>=0 );
3184 assert( cursorHoldsMutex(pCur) );
3187 aPayload = pCur->info.pCell + pCur->info.nHeader;
3188 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3193 if( offset+amt > nKey+pCur->info.nData ){
3194 /* Trying to read or write past the end of the data is an error */
3195 return SQLITE_CORRUPT_BKPT;
3198 /* Check if data must be read/written to/from the btree page itself. */
3199 if( offset<pCur->info.nLocal ){
3201 if( a+offset>pCur->info.nLocal ){
3202 a = pCur->info.nLocal - offset;
3204 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3209 offset -= pCur->info.nLocal;
3213 if( rc==SQLITE_OK && amt>0 ){
3214 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3217 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3219 #ifndef SQLITE_OMIT_INCRBLOB
3220 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3221 ** has not been allocated, allocate it now. The array is sized at
3222 ** one entry for each overflow page in the overflow chain. The
3223 ** page number of the first overflow page is stored in aOverflow[0],
3224 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3225 ** (the cache is lazily populated).
3227 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3228 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3229 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3230 if( nOvfl && !pCur->aOverflow ){
3235 /* If the overflow page-list cache has been allocated and the
3236 ** entry for the first required overflow page is valid, skip
3239 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3240 iIdx = (offset/ovflSize);
3241 nextPage = pCur->aOverflow[iIdx];
3242 offset = (offset%ovflSize);
3246 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3248 #ifndef SQLITE_OMIT_INCRBLOB
3249 /* If required, populate the overflow page-list cache. */
3250 if( pCur->aOverflow ){
3251 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3252 pCur->aOverflow[iIdx] = nextPage;
3256 if( offset>=ovflSize ){
3257 /* The only reason to read this page is to obtain the page
3258 ** number for the next page in the overflow chain. The page
3259 ** data is not required. So first try to lookup the overflow
3260 ** page-list cache, if any, then fall back to the getOverflowPage()
3263 #ifndef SQLITE_OMIT_INCRBLOB
3264 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3265 nextPage = pCur->aOverflow[iIdx+1];
3268 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3271 /* Need to read this page properly. It contains some of the
3272 ** range of data that is being read (eOp==0) or written (eOp!=0).
3276 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3277 if( rc==SQLITE_OK ){
3278 aPayload = sqlite3PagerGetData(pDbPage);
3279 nextPage = get4byte(aPayload);
3280 if( a + offset > ovflSize ){
3281 a = ovflSize - offset;
3283 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3284 sqlite3PagerUnref(pDbPage);
3293 if( rc==SQLITE_OK && amt>0 ){
3294 return SQLITE_CORRUPT_BKPT;
3300 ** Read part of the key associated with cursor pCur. Exactly
3301 ** "amt" bytes will be transfered into pBuf[]. The transfer
3302 ** begins at "offset".
3304 ** Return SQLITE_OK on success or an error code if anything goes
3305 ** wrong. An error is returned if "offset+amt" is larger than
3306 ** the available payload.
3308 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3311 assert( cursorHoldsMutex(pCur) );
3312 rc = restoreCursorPosition(pCur);
3313 if( rc==SQLITE_OK ){
3314 assert( pCur->eState==CURSOR_VALID );
3315 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3316 if( pCur->apPage[0]->intKey ){
3317 return SQLITE_CORRUPT_BKPT;
3319 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3320 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3326 ** Read part of the data associated with cursor pCur. Exactly
3327 ** "amt" bytes will be transfered into pBuf[]. The transfer
3328 ** begins at "offset".
3330 ** Return SQLITE_OK on success or an error code if anything goes
3331 ** wrong. An error is returned if "offset+amt" is larger than
3332 ** the available payload.
3334 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3337 #ifndef SQLITE_OMIT_INCRBLOB
3338 if ( pCur->eState==CURSOR_INVALID ){
3339 return SQLITE_ABORT;
3343 assert( cursorHoldsMutex(pCur) );
3344 rc = restoreCursorPosition(pCur);
3345 if( rc==SQLITE_OK ){
3346 assert( pCur->eState==CURSOR_VALID );
3347 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3348 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3349 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3355 ** Return a pointer to payload information from the entry that the
3356 ** pCur cursor is pointing to. The pointer is to the beginning of
3357 ** the key if skipKey==0 and it points to the beginning of data if
3358 ** skipKey==1. The number of bytes of available key/data is written
3359 ** into *pAmt. If *pAmt==0, then the value returned will not be
3362 ** This routine is an optimization. It is common for the entire key
3363 ** and data to fit on the local page and for there to be no overflow
3364 ** pages. When that is so, this routine can be used to access the
3365 ** key and data without making a copy. If the key and/or data spills
3366 ** onto overflow pages, then accessPayload() must be used to reassembly
3367 ** the key/data and copy it into a preallocated buffer.
3369 ** The pointer returned by this routine looks directly into the cached
3370 ** page of the database. The data might change or move the next time
3371 ** any btree routine is called.
3373 static const unsigned char *fetchPayload(
3374 BtCursor *pCur, /* Cursor pointing to entry to read from */
3375 int *pAmt, /* Write the number of available bytes here */
3376 int skipKey /* read beginning at data if this is true */
3378 unsigned char *aPayload;
3383 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
3384 assert( pCur->eState==CURSOR_VALID );
3385 assert( cursorHoldsMutex(pCur) );
3386 pPage = pCur->apPage[pCur->iPage];
3387 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3389 aPayload = pCur->info.pCell;
3390 aPayload += pCur->info.nHeader;
3391 if( pPage->intKey ){
3394 nKey = pCur->info.nKey;
3398 nLocal = pCur->info.nLocal - nKey;
3400 nLocal = pCur->info.nLocal;
3411 ** For the entry that cursor pCur is point to, return as
3412 ** many bytes of the key or data as are available on the local
3413 ** b-tree page. Write the number of available bytes into *pAmt.
3415 ** The pointer returned is ephemeral. The key/data may move
3416 ** or be destroyed on the next call to any Btree routine,
3417 ** including calls from other threads against the same cache.
3418 ** Hence, a mutex on the BtShared should be held prior to calling
3421 ** These routines is used to get quick access to key and data
3422 ** in the common case where no overflow pages are used.
3424 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3425 assert( cursorHoldsMutex(pCur) );
3426 if( pCur->eState==CURSOR_VALID ){
3427 return (const void*)fetchPayload(pCur, pAmt, 0);
3431 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3432 assert( cursorHoldsMutex(pCur) );
3433 if( pCur->eState==CURSOR_VALID ){
3434 return (const void*)fetchPayload(pCur, pAmt, 1);
3441 ** Move the cursor down to a new child page. The newPgno argument is the
3442 ** page number of the child page to move to.
3444 static int moveToChild(BtCursor *pCur, u32 newPgno){
3446 int i = pCur->iPage;
3448 BtShared *pBt = pCur->pBt;
3450 assert( cursorHoldsMutex(pCur) );
3451 assert( pCur->eState==CURSOR_VALID );
3452 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3453 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3454 return SQLITE_CORRUPT_BKPT;
3456 rc = getAndInitPage(pBt, newPgno, &pNewPage);
3458 pCur->apPage[i+1] = pNewPage;
3459 pCur->aiIdx[i+1] = 0;
3462 pCur->info.nSize = 0;
3463 pCur->validNKey = 0;
3464 if( pNewPage->nCell<1 ){
3465 return SQLITE_CORRUPT_BKPT;
3472 ** Page pParent is an internal (non-leaf) tree page. This function
3473 ** asserts that page number iChild is the left-child if the iIdx'th
3474 ** cell in page pParent. Or, if iIdx is equal to the total number of
3475 ** cells in pParent, that page number iChild is the right-child of
3478 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3479 assert( iIdx<=pParent->nCell );
3480 if( iIdx==pParent->nCell ){
3481 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3483 assert( get4byte(findCell(pParent, iIdx))==iChild );
3487 # define assertParentIndex(x,y,z)
3491 ** Move the cursor up to the parent page.
3493 ** pCur->idx is set to the cell index that contains the pointer
3494 ** to the page we are coming from. If we are coming from the
3495 ** right-most child page then pCur->idx is set to one more than
3496 ** the largest cell index.
3498 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3499 assert( cursorHoldsMutex(pCur) );
3500 assert( pCur->eState==CURSOR_VALID );
3501 assert( pCur->iPage>0 );
3502 assert( pCur->apPage[pCur->iPage] );
3504 pCur->apPage[pCur->iPage-1],
3505 pCur->aiIdx[pCur->iPage-1],
3506 pCur->apPage[pCur->iPage]->pgno
3508 releasePage(pCur->apPage[pCur->iPage]);
3510 pCur->info.nSize = 0;
3511 pCur->validNKey = 0;
3515 ** Move the cursor to the root page
3517 static int moveToRoot(BtCursor *pCur){
3520 Btree *p = pCur->pBtree;
3521 BtShared *pBt = p->pBt;
3523 assert( cursorHoldsMutex(pCur) );
3524 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3525 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3526 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3527 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3528 if( pCur->eState==CURSOR_FAULT ){
3531 clearCursorPosition(pCur);
3534 if( pCur->iPage>=0 ){
3536 for(i=1; i<=pCur->iPage; i++){
3537 releasePage(pCur->apPage[i]);
3541 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
3543 pCur->eState = CURSOR_INVALID;
3548 pRoot = pCur->apPage[0];
3549 assert( pRoot->pgno==pCur->pgnoRoot );
3552 pCur->info.nSize = 0;
3554 pCur->validNKey = 0;
3556 if( pRoot->nCell==0 && !pRoot->leaf ){
3558 assert( pRoot->pgno==1 );
3559 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3560 assert( subpage>0 );
3561 pCur->eState = CURSOR_VALID;
3562 rc = moveToChild(pCur, subpage);
3564 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3570 ** Move the cursor down to the left-most leaf entry beneath the
3571 ** entry to which it is currently pointing.
3573 ** The left-most leaf is the one with the smallest key - the first
3574 ** in ascending order.
3576 static int moveToLeftmost(BtCursor *pCur){
3581 assert( cursorHoldsMutex(pCur) );
3582 assert( pCur->eState==CURSOR_VALID );
3583 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3584 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3585 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
3586 rc = moveToChild(pCur, pgno);
3592 ** Move the cursor down to the right-most leaf entry beneath the
3593 ** page to which it is currently pointing. Notice the difference
3594 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3595 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3596 ** finds the right-most entry beneath the *page*.
3598 ** The right-most entry is the one with the largest key - the last
3599 ** key in ascending order.
3601 static int moveToRightmost(BtCursor *pCur){
3606 assert( cursorHoldsMutex(pCur) );
3607 assert( pCur->eState==CURSOR_VALID );
3608 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3609 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3610 pCur->aiIdx[pCur->iPage] = pPage->nCell;
3611 rc = moveToChild(pCur, pgno);
3613 if( rc==SQLITE_OK ){
3614 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
3615 pCur->info.nSize = 0;
3616 pCur->validNKey = 0;
3621 /* Move the cursor to the first entry in the table. Return SQLITE_OK
3622 ** on success. Set *pRes to 0 if the cursor actually points to something
3623 ** or set *pRes to 1 if the table is empty.
3625 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3628 assert( cursorHoldsMutex(pCur) );
3629 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3630 rc = moveToRoot(pCur);
3631 if( rc==SQLITE_OK ){
3632 if( pCur->eState==CURSOR_INVALID ){
3633 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3637 assert( pCur->apPage[pCur->iPage]->nCell>0 );
3639 rc = moveToLeftmost(pCur);
3645 /* Move the cursor to the last entry in the table. Return SQLITE_OK
3646 ** on success. Set *pRes to 0 if the cursor actually points to something
3647 ** or set *pRes to 1 if the table is empty.
3649 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3652 assert( cursorHoldsMutex(pCur) );
3653 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3654 rc = moveToRoot(pCur);
3655 if( rc==SQLITE_OK ){
3656 if( CURSOR_INVALID==pCur->eState ){
3657 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3660 assert( pCur->eState==CURSOR_VALID );
3662 rc = moveToRightmost(pCur);
3664 pCur->atLast = rc==SQLITE_OK;
3670 /* Move the cursor so that it points to an entry near the key
3671 ** specified by pIdxKey or intKey. Return a success code.
3673 ** For INTKEY tables, the intKey parameter is used. pIdxKey
3674 ** must be NULL. For index tables, pIdxKey is used and intKey
3677 ** If an exact match is not found, then the cursor is always
3678 ** left pointing at a leaf page which would hold the entry if it
3679 ** were present. The cursor might point to an entry that comes
3680 ** before or after the key.
3682 ** The result of comparing the key with the entry to which the
3683 ** cursor is written to *pRes if pRes!=NULL. The meaning of
3684 ** this value is as follows:
3686 ** *pRes<0 The cursor is left pointing at an entry that
3687 ** is smaller than pKey or if the table is empty
3688 ** and the cursor is therefore left point to nothing.
3690 ** *pRes==0 The cursor is left pointing at an entry that
3691 ** exactly matches pKey.
3693 ** *pRes>0 The cursor is left pointing at an entry that
3694 ** is larger than pKey.
3697 int sqlite3BtreeMovetoUnpacked(
3698 BtCursor *pCur, /* The cursor to be moved */
3699 UnpackedRecord *pIdxKey, /* Unpacked index key */
3700 i64 intKey, /* The table key */
3701 int biasRight, /* If true, bias the search to the high end */
3702 int *pRes /* Write search results here */
3706 assert( cursorHoldsMutex(pCur) );
3707 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3709 /* If the cursor is already positioned at the point we are trying
3710 ** to move to, then just return without doing any work */
3711 if( pCur->eState==CURSOR_VALID && pCur->validNKey
3712 && pCur->apPage[0]->intKey
3714 if( pCur->info.nKey==intKey ){
3718 if( pCur->atLast && pCur->info.nKey<intKey ){
3724 rc = moveToRoot(pCur);
3728 assert( pCur->apPage[pCur->iPage] );
3729 assert( pCur->apPage[pCur->iPage]->isInit );
3730 if( pCur->eState==CURSOR_INVALID ){
3732 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3735 assert( pCur->apPage[0]->intKey || pIdxKey );
3739 MemPage *pPage = pCur->apPage[pCur->iPage];
3740 int c = -1; /* pRes return if table is empty must be -1 */
3742 upr = pPage->nCell-1;
3743 if( !pPage->intKey && pIdxKey==0 ){
3744 rc = SQLITE_CORRUPT_BKPT;
3748 pCur->aiIdx[pCur->iPage] = upr;
3750 pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
3752 if( lwr<=upr ) for(;;){
3755 int idx = pCur->aiIdx[pCur->iPage];
3756 pCur->info.nSize = 0;
3757 pCur->validNKey = 1;
3758 if( pPage->intKey ){
3760 pCell = findCell(pPage, idx) + pPage->childPtrSize;
3761 if( pPage->hasData ){
3763 pCell += getVarint32(pCell, dummy);
3765 getVarint(pCell, (u64*)&nCellKey);
3766 if( nCellKey==intKey ){
3768 }else if( nCellKey<intKey ){
3771 assert( nCellKey>intKey );
3776 pCellKey = (void *)fetchPayload(pCur, &available, 0);
3777 nCellKey = pCur->info.nKey;
3778 if( available>=nCellKey ){
3779 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
3781 pCellKey = sqlite3Malloc( nCellKey );
3786 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3787 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
3788 sqlite3_free(pCellKey);
3789 if( rc ) goto moveto_finish;
3793 pCur->info.nKey = nCellKey;
3794 if( pPage->intKey && !pPage->leaf ){
3799 if( pRes ) *pRes = 0;
3810 pCur->info.nKey = nCellKey;
3813 pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
3815 assert( lwr==upr+1 );
3816 assert( pPage->isInit );
3819 }else if( lwr>=pPage->nCell ){
3820 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3822 chldPg = get4byte(findCell(pPage, lwr));
3825 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3826 if( pRes ) *pRes = c;
3830 pCur->aiIdx[pCur->iPage] = lwr;
3831 pCur->info.nSize = 0;
3832 pCur->validNKey = 0;
3833 rc = moveToChild(pCur, chldPg);
3834 if( rc ) goto moveto_finish;
3841 ** In this version of BtreeMoveto, pKey is a packed index record
3842 ** such as is generated by the OP_MakeRecord opcode. Unpack the
3843 ** record and then call BtreeMovetoUnpacked() to do the work.
3845 int sqlite3BtreeMoveto(
3846 BtCursor *pCur, /* Cursor open on the btree to be searched */
3847 const void *pKey, /* Packed key if the btree is an index */
3848 i64 nKey, /* Integer key for tables. Size of pKey for indices */
3849 int bias, /* Bias search to the high end */
3850 int *pRes /* Write search results here */
3852 int rc; /* Status code */
3853 UnpackedRecord *pIdxKey; /* Unpacked index key */
3854 UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
3857 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
3858 aSpace, sizeof(aSpace));
3859 if( pIdxKey==0 ) return SQLITE_NOMEM;
3863 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
3865 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
3872 ** Return TRUE if the cursor is not pointing at an entry of the table.
3874 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3875 ** past the last entry in the table or sqlite3BtreePrev() moves past
3876 ** the first entry. TRUE is also returned if the table is empty.
3878 int sqlite3BtreeEof(BtCursor *pCur){
3879 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3880 ** have been deleted? This API will need to change to return an error code
3881 ** as well as the boolean result value.
3883 return (CURSOR_VALID!=pCur->eState);
3887 ** Return the database connection handle for a cursor.
3889 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3890 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3891 return pCur->pBtree->db;
3895 ** Advance the cursor to the next entry in the database. If
3896 ** successful then set *pRes=0. If the cursor
3897 ** was already pointing to the last entry in the database before
3898 ** this routine was called, then set *pRes=1.
3900 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3905 assert( cursorHoldsMutex(pCur) );
3906 rc = restoreCursorPosition(pCur);
3907 if( rc!=SQLITE_OK ){
3911 if( CURSOR_INVALID==pCur->eState ){
3922 pPage = pCur->apPage[pCur->iPage];
3923 idx = ++pCur->aiIdx[pCur->iPage];
3924 assert( pPage->isInit );
3925 assert( idx<=pPage->nCell );
3927 pCur->info.nSize = 0;
3928 pCur->validNKey = 0;
3929 if( idx>=pPage->nCell ){
3931 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3933 rc = moveToLeftmost(pCur);
3938 if( pCur->iPage==0 ){
3940 pCur->eState = CURSOR_INVALID;
3943 sqlite3BtreeMoveToParent(pCur);
3944 pPage = pCur->apPage[pCur->iPage];
3945 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
3947 if( pPage->intKey ){
3948 rc = sqlite3BtreeNext(pCur, pRes);
3958 rc = moveToLeftmost(pCur);
3964 ** Step the cursor to the back to the previous entry in the database. If
3965 ** successful then set *pRes=0. If the cursor
3966 ** was already pointing to the first entry in the database before
3967 ** this routine was called, then set *pRes=1.
3969 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3973 assert( cursorHoldsMutex(pCur) );
3974 rc = restoreCursorPosition(pCur);
3975 if( rc!=SQLITE_OK ){
3979 if( CURSOR_INVALID==pCur->eState ){
3990 pPage = pCur->apPage[pCur->iPage];
3991 assert( pPage->isInit );
3993 int idx = pCur->aiIdx[pCur->iPage];
3994 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
3998 rc = moveToRightmost(pCur);
4000 while( pCur->aiIdx[pCur->iPage]==0 ){
4001 if( pCur->iPage==0 ){
4002 pCur->eState = CURSOR_INVALID;
4006 sqlite3BtreeMoveToParent(pCur);
4008 pCur->info.nSize = 0;
4009 pCur->validNKey = 0;
4011 pCur->aiIdx[pCur->iPage]--;
4012 pPage = pCur->apPage[pCur->iPage];
4013 if( pPage->intKey && !pPage->leaf ){
4014 rc = sqlite3BtreePrevious(pCur, pRes);
4024 ** Allocate a new page from the database file.
4026 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
4027 ** has already been called on the new page.) The new page has also
4028 ** been referenced and the calling routine is responsible for calling
4029 ** sqlite3PagerUnref() on the new page when it is done.
4031 ** SQLITE_OK is returned on success. Any other return value indicates
4032 ** an error. *ppPage and *pPgno are undefined in the event of an error.
4033 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4035 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4036 ** locate a page close to the page number "nearby". This can be used in an
4037 ** attempt to keep related pages close to each other in the database file,
4038 ** which in turn can make database access faster.
4040 ** If the "exact" parameter is not 0, and the page-number nearby exists
4041 ** anywhere on the free-list, then it is guarenteed to be returned. This
4042 ** is only used by auto-vacuum databases when allocating a new table.
4044 static int allocateBtreePage(
4053 int n; /* Number of pages on the freelist */
4054 int k; /* Number of leaves on the trunk of the freelist */
4055 MemPage *pTrunk = 0;
4056 MemPage *pPrevTrunk = 0;
4058 assert( sqlite3_mutex_held(pBt->mutex) );
4059 pPage1 = pBt->pPage1;
4060 n = get4byte(&pPage1->aData[36]);
4062 /* There are pages on the freelist. Reuse one of those pages. */
4064 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4066 /* If the 'exact' parameter was true and a query of the pointer-map
4067 ** shows that the page 'nearby' is somewhere on the free-list, then
4068 ** the entire-list will be searched for that page.
4070 #ifndef SQLITE_OMIT_AUTOVACUUM
4071 if( exact && nearby<=pagerPagecount(pBt->pPager) ){
4074 assert( pBt->autoVacuum );
4075 rc = ptrmapGet(pBt, nearby, &eType, 0);
4077 if( eType==PTRMAP_FREEPAGE ){
4084 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4085 ** first free-list trunk page. iPrevTrunk is initially 1.
4087 rc = sqlite3PagerWrite(pPage1->pDbPage);
4089 put4byte(&pPage1->aData[36], n-1);
4091 /* The code within this loop is run only once if the 'searchList' variable
4092 ** is not true. Otherwise, it runs once for each trunk-page on the
4093 ** free-list until the page 'nearby' is located.
4096 pPrevTrunk = pTrunk;
4098 iTrunk = get4byte(&pPrevTrunk->aData[0]);
4100 iTrunk = get4byte(&pPage1->aData[32]);
4102 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
4105 goto end_allocate_page;
4108 k = get4byte(&pTrunk->aData[4]);
4109 if( k==0 && !searchList ){
4110 /* The trunk has no leaves and the list is not being searched.
4111 ** So extract the trunk page itself and use it as the newly
4112 ** allocated page */
4113 assert( pPrevTrunk==0 );
4114 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4116 goto end_allocate_page;
4119 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4122 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4123 }else if( k>pBt->usableSize/4 - 2 ){
4124 /* Value of k is out of range. Database corruption */
4125 rc = SQLITE_CORRUPT_BKPT;
4126 goto end_allocate_page;
4127 #ifndef SQLITE_OMIT_AUTOVACUUM
4128 }else if( searchList && nearby==iTrunk ){
4129 /* The list is being searched and this trunk page is the page
4130 ** to allocate, regardless of whether it has leaves.
4132 assert( *pPgno==iTrunk );
4135 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4137 goto end_allocate_page;
4141 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4143 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4146 /* The trunk page is required by the caller but it contains
4147 ** pointers to free-list leaves. The first leaf becomes a trunk
4148 ** page in this case.
4151 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4152 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4153 if( rc!=SQLITE_OK ){
4154 goto end_allocate_page;
4156 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4157 if( rc!=SQLITE_OK ){
4158 releasePage(pNewTrunk);
4159 goto end_allocate_page;
4161 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4162 put4byte(&pNewTrunk->aData[4], k-1);
4163 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4164 releasePage(pNewTrunk);
4166 put4byte(&pPage1->aData[32], iNewTrunk);
4168 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4170 goto end_allocate_page;
4172 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4176 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4179 /* Extract a leaf from the trunk */
4182 unsigned char *aData = pTrunk->aData;
4183 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4185 goto end_allocate_page;
4190 dist = get4byte(&aData[8]) - nearby;
4191 if( dist<0 ) dist = -dist;
4193 int d2 = get4byte(&aData[8+i*4]) - nearby;
4194 if( d2<0 ) d2 = -d2;
4204 iPage = get4byte(&aData[8+closest*4]);
4205 if( !searchList || iPage==nearby ){
4208 nPage = pagerPagecount(pBt->pPager);
4210 /* Free page off the end of the file */
4211 rc = SQLITE_CORRUPT_BKPT;
4212 goto end_allocate_page;
4214 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4215 ": %d more free pages\n",
4216 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4218 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4220 put4byte(&aData[4], k-1);
4221 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4222 if( rc==SQLITE_OK ){
4223 sqlite3PagerDontRollback((*ppPage)->pDbPage);
4224 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4225 if( rc!=SQLITE_OK ){
4226 releasePage(*ppPage);
4232 releasePage(pPrevTrunk);
4234 }while( searchList );
4236 /* There are no pages on the freelist, so create a new page at the
4237 ** end of the file */
4238 int nPage = pagerPagecount(pBt->pPager);
4241 #ifndef SQLITE_OMIT_AUTOVACUUM
4243 /* An incr-vacuum has already run within this transaction. So the
4244 ** page to allocate is not from the physical end of the file, but
4247 *pPgno = pBt->nTrunc+1;
4248 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4252 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4253 /* If *pPgno refers to a pointer-map page, allocate two new pages
4254 ** at the end of the file instead of one. The first allocated page
4255 ** becomes a new pointer-map page, the second is used by the caller.
4257 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4258 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4260 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4263 pBt->nTrunc = *pPgno;
4267 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4268 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4270 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4271 if( rc!=SQLITE_OK ){
4272 releasePage(*ppPage);
4274 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4277 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4280 releasePage(pTrunk);
4281 releasePage(pPrevTrunk);
4282 if( rc==SQLITE_OK && sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4283 releasePage(*ppPage);
4284 return SQLITE_CORRUPT_BKPT;
4290 ** Add a page of the database file to the freelist.
4292 ** sqlite3PagerUnref() is NOT called for pPage.
4294 static int freePage(MemPage *pPage){
4295 BtShared *pBt = pPage->pBt;
4296 MemPage *pPage1 = pBt->pPage1;
4299 /* Prepare the page for freeing */
4300 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4301 assert( pPage->pgno>1 );
4304 /* Increment the free page count on pPage1 */
4305 rc = sqlite3PagerWrite(pPage1->pDbPage);
4307 n = get4byte(&pPage1->aData[36]);
4308 put4byte(&pPage1->aData[36], n+1);
4310 #ifdef SQLITE_SECURE_DELETE
4311 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4312 ** always fully overwrite deleted information with zeros.
4314 rc = sqlite3PagerWrite(pPage->pDbPage);
4316 memset(pPage->aData, 0, pPage->pBt->pageSize);
4319 /* If the database supports auto-vacuum, write an entry in the pointer-map
4320 ** to indicate that the page is free.
4323 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4328 /* This is the first free page */
4329 rc = sqlite3PagerWrite(pPage->pDbPage);
4331 memset(pPage->aData, 0, 8);
4332 put4byte(&pPage1->aData[32], pPage->pgno);
4333 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4335 /* Other free pages already exist. Retrive the first trunk page
4336 ** of the freelist and find out how many leaves it has. */
4338 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4340 k = get4byte(&pTrunk->aData[4]);
4341 if( k>=pBt->usableSize/4 - 8 ){
4342 /* The trunk is full. Turn the page being freed into a new
4343 ** trunk page with no leaves.
4345 ** Note that the trunk page is not really full until it contains
4346 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4347 ** coded. But due to a coding error in versions of SQLite prior to
4348 ** 3.6.0, databases with freelist trunk pages holding more than
4349 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4350 ** to maintain backwards compatibility with older versions of SQLite,
4351 ** we will contain to restrict the number of entries to usableSize/4 - 8
4352 ** for now. At some point in the future (once everyone has upgraded
4353 ** to 3.6.0 or later) we should consider fixing the conditional above
4354 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4356 rc = sqlite3PagerWrite(pPage->pDbPage);
4357 if( rc==SQLITE_OK ){
4358 put4byte(pPage->aData, pTrunk->pgno);
4359 put4byte(&pPage->aData[4], 0);
4360 put4byte(&pPage1->aData[32], pPage->pgno);
4361 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4362 pPage->pgno, pTrunk->pgno));
4365 rc = SQLITE_CORRUPT;
4367 /* Add the newly freed page as a leaf on the current trunk */
4368 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4369 if( rc==SQLITE_OK ){
4370 put4byte(&pTrunk->aData[4], k+1);
4371 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4372 #ifndef SQLITE_SECURE_DELETE
4373 rc = sqlite3PagerDontWrite(pPage->pDbPage);
4376 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4378 releasePage(pTrunk);
4384 ** Free any overflow pages associated with the given Cell.
4386 static int clearCell(MemPage *pPage, unsigned char *pCell){
4387 BtShared *pBt = pPage->pBt;
4394 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4395 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4396 if( info.iOverflow==0 ){
4397 return SQLITE_OK; /* No overflow pages. Return without doing anything */
4399 ovflPgno = get4byte(&pCell[info.iOverflow]);
4400 ovflPageSize = pBt->usableSize - 4;
4401 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4402 assert( ovflPgno==0 || nOvfl>0 );
4405 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
4406 return SQLITE_CORRUPT_BKPT;
4409 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4411 rc = freePage(pOvfl);
4412 sqlite3PagerUnref(pOvfl->pDbPage);
4419 ** Create the byte sequence used to represent a cell on page pPage
4420 ** and write that byte sequence into pCell[]. Overflow pages are
4421 ** allocated and filled in as necessary. The calling procedure
4422 ** is responsible for making sure sufficient space has been allocated
4425 ** Note that pCell does not necessary need to point to the pPage->aData
4426 ** area. pCell might point to some temporary storage. The cell will
4427 ** be constructed in this temporary area then copied into pPage->aData
4430 static int fillInCell(
4431 MemPage *pPage, /* The page that contains the cell */
4432 unsigned char *pCell, /* Complete text of the cell */
4433 const void *pKey, i64 nKey, /* The key */
4434 const void *pData,int nData, /* The data */
4435 int nZero, /* Extra zero bytes to append to pData */
4436 int *pnSize /* Write cell size here */
4443 MemPage *pToRelease = 0;
4444 unsigned char *pPrior;
4445 unsigned char *pPayload;
4446 BtShared *pBt = pPage->pBt;
4451 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4453 /* Fill in the header. */
4458 if( pPage->hasData ){
4459 nHeader += putVarint(&pCell[nHeader], nData+nZero);
4463 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4464 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4465 assert( info.nHeader==nHeader );
4466 assert( info.nKey==nKey );
4467 assert( info.nData==nData+nZero );
4469 /* Fill in the payload */
4470 nPayload = nData + nZero;
4471 if( pPage->intKey ){
4480 *pnSize = info.nSize;
4481 spaceLeft = info.nLocal;
4482 pPayload = &pCell[nHeader];
4483 pPrior = &pCell[info.iOverflow];
4485 while( nPayload>0 ){
4488 #ifndef SQLITE_OMIT_AUTOVACUUM
4489 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4490 if( pBt->autoVacuum ){
4494 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4501 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4502 #ifndef SQLITE_OMIT_AUTOVACUUM
4503 /* If the database supports auto-vacuum, and the second or subsequent
4504 ** overflow page is being allocated, add an entry to the pointer-map
4505 ** for that page now.
4507 ** If this is the first overflow page, then write a partial entry
4508 ** to the pointer-map. If we write nothing to this pointer-map slot,
4509 ** then the optimistic overflow chain processing in clearCell()
4510 ** may misinterpret the uninitialised values and delete the
4511 ** wrong pages from the database.
4513 if( pBt->autoVacuum && rc==SQLITE_OK ){
4514 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4515 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4522 releasePage(pToRelease);
4525 put4byte(pPrior, pgnoOvfl);
4526 releasePage(pToRelease);
4528 pPrior = pOvfl->aData;
4529 put4byte(pPrior, 0);
4530 pPayload = &pOvfl->aData[4];
4531 spaceLeft = pBt->usableSize - 4;
4534 if( n>spaceLeft ) n = spaceLeft;
4536 if( n>nSrc ) n = nSrc;
4538 memcpy(pPayload, pSrc, n);
4540 memset(pPayload, 0, n);
4552 releasePage(pToRelease);
4557 ** Remove the i-th cell from pPage. This routine effects pPage only.
4558 ** The cell content is not freed or deallocated. It is assumed that
4559 ** the cell content has been copied someplace else. This routine just
4560 ** removes the reference to the cell from pPage.
4562 ** "sz" must be the number of bytes in the cell.
4564 static void dropCell(MemPage *pPage, int idx, int sz){
4565 int i; /* Loop counter */
4566 int pc; /* Offset to cell content of cell being deleted */
4567 u8 *data; /* pPage->aData */
4568 u8 *ptr; /* Used to move bytes around within data[] */
4570 assert( idx>=0 && idx<pPage->nCell );
4571 assert( sz==cellSize(pPage, idx) );
4572 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4573 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4574 data = pPage->aData;
4575 ptr = &data[pPage->cellOffset + 2*idx];
4577 assert( pc>10 && pc+sz<=pPage->pBt->usableSize );
4578 freeSpace(pPage, pc, sz);
4579 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4584 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4589 ** Insert a new cell on pPage at cell index "i". pCell points to the
4590 ** content of the cell.
4592 ** If the cell content will fit on the page, then put it there. If it
4593 ** will not fit, then make a copy of the cell content into pTemp if
4594 ** pTemp is not null. Regardless of pTemp, allocate a new entry
4595 ** in pPage->aOvfl[] and make it point to the cell content (either
4596 ** in pTemp or the original pCell) and also record its index.
4597 ** Allocating a new entry in pPage->aCell[] implies that
4598 ** pPage->nOverflow is incremented.
4600 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4601 ** cell. The caller will overwrite them after this function returns. If
4602 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4603 ** (but pCell+nSkip is always valid).
4605 static int insertCell(
4606 MemPage *pPage, /* Page into which we are copying */
4607 int i, /* New cell becomes the i-th cell of the page */
4608 u8 *pCell, /* Content of the new cell */
4609 int sz, /* Bytes of content in pCell */
4610 u8 *pTemp, /* Temp storage space for pCell, if needed */
4611 u8 nSkip /* Do not write the first nSkip bytes of the cell */
4613 int idx; /* Where to write new cell content in data[] */
4614 int j; /* Loop counter */
4615 int top; /* First byte of content for any cell in data[] */
4616 int end; /* First byte past the last cell pointer in data[] */
4617 int ins; /* Index in data[] where new cell pointer is inserted */
4618 int hdr; /* Offset into data[] of the page header */
4619 int cellOffset; /* Address of first cell pointer in data[] */
4620 u8 *data; /* The content of the whole page */
4621 u8 *ptr; /* Used for moving information around in data[] */
4623 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4624 assert( sz==cellSizePtr(pPage, pCell) );
4625 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4626 if( pPage->nOverflow || sz+2>pPage->nFree ){
4628 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4631 j = pPage->nOverflow++;
4632 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4633 pPage->aOvfl[j].pCell = pCell;
4634 pPage->aOvfl[j].idx = i;
4637 int rc = sqlite3PagerWrite(pPage->pDbPage);
4638 if( rc!=SQLITE_OK ){
4641 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4642 data = pPage->aData;
4643 hdr = pPage->hdrOffset;
4644 top = get2byte(&data[hdr+5]);
4645 cellOffset = pPage->cellOffset;
4646 end = cellOffset + 2*pPage->nCell + 2;
4647 ins = cellOffset + 2*i;
4648 if( end > top - sz ){
4649 defragmentPage(pPage);
4650 top = get2byte(&data[hdr+5]);
4651 assert( end + sz <= top );
4653 idx = allocateSpace(pPage, sz);
4655 assert( end <= get2byte(&data[hdr+5]) );
4658 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4659 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4663 put2byte(&data[ins], idx);
4664 put2byte(&data[hdr+3], pPage->nCell);
4665 #ifndef SQLITE_OMIT_AUTOVACUUM
4666 if( pPage->pBt->autoVacuum ){
4667 /* The cell may contain a pointer to an overflow page. If so, write
4668 ** the entry for the overflow page into the pointer map.
4671 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4672 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4673 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4674 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4675 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4676 if( rc!=SQLITE_OK ) return rc;
4686 ** Add a list of cells to a page. The page should be initially empty.
4687 ** The cells are guaranteed to fit on the page.
4689 static void assemblePage(
4690 MemPage *pPage, /* The page to be assemblied */
4691 int nCell, /* The number of cells to add to this page */
4692 u8 **apCell, /* Pointers to cell bodies */
4693 u16 *aSize /* Sizes of the cells */
4695 int i; /* Loop counter */
4696 int totalSize; /* Total size of all cells */
4697 int hdr; /* Index of page header */
4698 int cellptr; /* Address of next cell pointer */
4699 int cellbody; /* Address of next cell body */
4700 u8 *data; /* Data for the page */
4702 assert( pPage->nOverflow==0 );
4703 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4705 for(i=0; i<nCell; i++){
4706 totalSize += aSize[i];
4708 assert( totalSize+2*nCell<=pPage->nFree );
4709 assert( pPage->nCell==0 );
4710 cellptr = pPage->cellOffset;
4711 data = pPage->aData;
4712 hdr = pPage->hdrOffset;
4713 put2byte(&data[hdr+3], nCell);
4715 cellbody = allocateSpace(pPage, totalSize);
4716 assert( cellbody>0 );
4717 assert( pPage->nFree >= 2*nCell );
4718 pPage->nFree -= 2*nCell;
4719 for(i=0; i<nCell; i++){
4720 put2byte(&data[cellptr], cellbody);
4721 memcpy(&data[cellbody], apCell[i], aSize[i]);
4723 cellbody += aSize[i];
4725 assert( cellbody==pPage->pBt->usableSize );
4727 pPage->nCell = nCell;
4731 ** The following parameters determine how many adjacent pages get involved
4732 ** in a balancing operation. NN is the number of neighbors on either side
4733 ** of the page that participate in the balancing operation. NB is the
4734 ** total number of pages that participate, including the target page and
4735 ** NN neighbors on either side.
4737 ** The minimum value of NN is 1 (of course). Increasing NN above 1
4738 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4739 ** in exchange for a larger degradation in INSERT and UPDATE performance.
4740 ** The value of NN appears to give the best results overall.
4742 #define NN 1 /* Number of neighbors on either side of pPage */
4743 #define NB (NN*2+1) /* Total pages involved in the balance */
4745 /* Forward reference */
4746 static int balance(BtCursor*, int);
4748 #ifndef SQLITE_OMIT_QUICKBALANCE
4750 ** This version of balance() handles the common special case where
4751 ** a new entry is being inserted on the extreme right-end of the
4752 ** tree, in other words, when the new entry will become the largest
4753 ** entry in the tree.
4755 ** Instead of trying balance the 3 right-most leaf pages, just add
4756 ** a new page to the right-hand side and put the one new entry in
4757 ** that page. This leaves the right side of the tree somewhat
4758 ** unbalanced. But odds are that we will be inserting new entries
4759 ** at the end soon afterwards so the nearly empty page will quickly
4760 ** fill up. On average.
4762 ** pPage is the leaf page which is the right-most page in the tree.
4763 ** pParent is its parent. pPage must have a single overflow entry
4764 ** which is also the right-most entry on the page.
4766 static int balance_quick(BtCursor *pCur){
4773 MemPage *pPage = pCur->apPage[pCur->iPage];
4774 MemPage *pParent = pCur->apPage[pCur->iPage-1];
4775 BtShared *pBt = pPage->pBt;
4776 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4777 int parentSize; /* Size of new divider cell */
4778 u8 parentCell[64]; /* Space for the new divider cell */
4780 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4782 /* Allocate a new page. Insert the overflow cell from pPage
4783 ** into it. Then remove the overflow cell from pPage.
4785 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4786 if( rc==SQLITE_OK ){
4787 pCell = pPage->aOvfl[0].pCell;
4788 szCell = cellSizePtr(pPage, pCell);
4789 zeroPage(pNew, pPage->aData[0]);
4790 assemblePage(pNew, 1, &pCell, &szCell);
4791 pPage->nOverflow = 0;
4793 /* pPage is currently the right-child of pParent. Change this
4794 ** so that the right-child is the new page allocated above and
4795 ** pPage is the next-to-right child.
4797 ** Ignore the return value of the call to fillInCell(). fillInCell()
4798 ** may only return other than SQLITE_OK if it is required to allocate
4799 ** one or more overflow pages. Since an internal table B-Tree cell
4800 ** may never spill over onto an overflow page (it is a maximum of
4801 ** 13 bytes in size), it is not neccessary to check the return code.
4803 ** Similarly, the insertCell() function cannot fail if the page
4804 ** being inserted into is already writable and the cell does not
4805 ** contain an overflow pointer. So ignore this return code too.
4807 assert( pPage->nCell>0 );
4808 pCell = findCell(pPage, pPage->nCell-1);
4809 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4810 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4811 assert( parentSize<64 );
4812 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4813 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4814 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4815 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4817 /* If this is an auto-vacuum database, update the pointer map
4818 ** with entries for the new page, and any pointer from the
4819 ** cell on the page to an overflow page.
4822 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4823 if( rc==SQLITE_OK ){
4824 rc = ptrmapPutOvfl(pNew, 0);
4828 /* Release the reference to the new page. */
4832 /* At this point the pPage->nFree variable is not set correctly with
4833 ** respect to the content of the page (because it was set to 0 by
4834 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
4837 ** This has to be done even if an error will be returned. Normally, if
4838 ** an error occurs during tree balancing, the contents of MemPage are
4839 ** not important, as they will be recalculated when the page is rolled
4840 ** back. But here, in balance_quick(), it is possible that pPage has
4841 ** not yet been marked dirty or written into the journal file. Therefore
4842 ** it will not be rolled back and so it is important to make sure that
4843 ** the page data and contents of MemPage are consistent.
4846 sqlite3BtreeInitPage(pPage);
4848 /* If everything else succeeded, balance the parent page, in
4849 ** case the divider cell inserted caused it to become overfull.
4851 if( rc==SQLITE_OK ){
4854 rc = balance(pCur, 0);
4858 #endif /* SQLITE_OMIT_QUICKBALANCE */
4861 ** This routine redistributes Cells on pPage and up to NN*2 siblings
4862 ** of pPage so that all pages have about the same amount of free space.
4863 ** Usually NN siblings on either side of pPage is used in the balancing,
4864 ** though more siblings might come from one side if pPage is the first
4865 ** or last child of its parent. If pPage has fewer than 2*NN siblings
4866 ** (something which can only happen if pPage is the root page or a
4867 ** child of root) then all available siblings participate in the balancing.
4869 ** The number of siblings of pPage might be increased or decreased by one or
4870 ** two in an effort to keep pages nearly full but not over full. The root page
4871 ** is special and is allowed to be nearly empty. If pPage is
4872 ** the root page, then the depth of the tree might be increased
4873 ** or decreased by one, as necessary, to keep the root page from being
4874 ** overfull or completely empty.
4876 ** Note that when this routine is called, some of the Cells on pPage
4877 ** might not actually be stored in pPage->aData[]. This can happen
4878 ** if the page is overfull. Part of the job of this routine is to
4879 ** make sure all Cells for pPage once again fit in pPage->aData[].
4881 ** In the course of balancing the siblings of pPage, the parent of pPage
4882 ** might become overfull or underfull. If that happens, then this routine
4883 ** is called recursively on the parent.
4885 ** If this routine fails for any reason, it might leave the database
4886 ** in a corrupted state. So if this routine fails, the database should
4889 static int balance_nonroot(BtCursor *pCur){
4890 MemPage *pPage; /* The over or underfull page to balance */
4891 MemPage *pParent; /* The parent of pPage */
4892 BtShared *pBt; /* The whole database */
4893 int nCell = 0; /* Number of cells in apCell[] */
4894 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4895 int nOld; /* Number of pages in apOld[] */
4896 int nNew; /* Number of pages in apNew[] */
4897 int nDiv; /* Number of cells in apDiv[] */
4898 int i, j, k; /* Loop counters */
4899 int idx; /* Index of pPage in pParent->aCell[] */
4900 int nxDiv; /* Next divider slot in pParent->aCell[] */
4901 int rc; /* The return code */
4902 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4903 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4904 int usableSpace; /* Bytes in pPage beyond the header */
4905 int pageFlags; /* Value of pPage->aData[0] */
4906 int subtotal; /* Subtotal of bytes in cells on one page */
4907 int iSpace1 = 0; /* First unused byte of aSpace1[] */
4908 int iSpace2 = 0; /* First unused byte of aSpace2[] */
4909 int szScratch; /* Size of scratch memory requested */
4910 MemPage *apOld[NB]; /* pPage and up to two siblings */
4911 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4912 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4913 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4914 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4915 u8 *apDiv[NB]; /* Divider cells in pParent */
4916 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4917 int szNew[NB+2]; /* Combined size of cells place on i-th page */
4918 u8 **apCell = 0; /* All cells begin balanced */
4919 u16 *szCell; /* Local size of all cells in apCell[] */
4920 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4921 u8 *aSpace1; /* Space for copies of dividers cells before balance */
4922 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
4925 pPage = pCur->apPage[pCur->iPage];
4926 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4927 VVA_ONLY( pCur->pagesShuffled = 1 );
4930 ** Find the parent page.
4932 assert( pCur->iPage>0 );
4933 assert( pPage->isInit );
4934 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4936 pParent = pCur->apPage[pCur->iPage-1];
4938 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4942 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4944 #ifndef SQLITE_OMIT_QUICKBALANCE
4946 ** A special case: If a new entry has just been inserted into a
4947 ** table (that is, a btree with integer keys and all data at the leaves)
4948 ** and the new entry is the right-most entry in the tree (it has the
4949 ** largest key) then use the special balance_quick() routine for
4950 ** balancing. balance_quick() is much faster and results in a tighter
4951 ** packing of data in the common case.
4955 pPage->nOverflow==1 &&
4956 pPage->aOvfl[0].idx==pPage->nCell &&
4958 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4960 assert( pPage->intKey );
4962 ** TODO: Check the siblings to the left of pPage. It may be that
4963 ** they are not full and no new page is required.
4965 return balance_quick(pCur);
4969 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4974 ** Find the cell in the parent page whose left child points back
4975 ** to pPage. The "idx" variable is the index of that cell. If pPage
4976 ** is the rightmost child of pParent then set idx to pParent->nCell
4978 idx = pCur->aiIdx[pCur->iPage-1];
4979 assertParentIndex(pParent, idx, pPage->pgno);
4982 ** Initialize variables so that it will be safe to jump
4983 ** directly to balance_cleanup at any moment.
4988 ** Find sibling pages to pPage and the cells in pParent that divide
4989 ** the siblings. An attempt is made to find NN siblings on either
4990 ** side of pPage. More siblings are taken from one side, however, if
4991 ** pPage there are fewer than NN siblings on the other side. If pParent
4992 ** has NB or fewer children then all children of pParent are taken.
4995 if( nxDiv + NB > pParent->nCell ){
4996 nxDiv = pParent->nCell - NB + 1;
5002 for(i=0, k=nxDiv; i<NB; i++, k++){
5003 if( k<pParent->nCell ){
5004 apDiv[i] = findCell(pParent, k);
5006 assert( !pParent->leaf );
5007 pgnoOld[i] = get4byte(apDiv[i]);
5008 }else if( k==pParent->nCell ){
5009 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
5013 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
5014 if( rc ) goto balance_cleanup;
5015 /* apOld[i]->idxParent = k; */
5019 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5022 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5024 nMaxCells = (nMaxCells + 3)&~3;
5027 ** Allocate space for memory structures
5030 nMaxCells*sizeof(u8*) /* apCell */
5031 + nMaxCells*sizeof(u16) /* szCell */
5032 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
5033 + pBt->pageSize /* aSpace1 */
5034 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5035 apCell = sqlite3ScratchMalloc( szScratch );
5038 goto balance_cleanup;
5040 szCell = (u16*)&apCell[nMaxCells];
5041 aCopy[0] = (u8*)&szCell[nMaxCells];
5042 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5043 for(i=1; i<NB; i++){
5044 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5045 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5047 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5048 assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5050 aFrom = &aSpace1[pBt->pageSize];
5052 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
5055 goto balance_cleanup;
5059 ** Make copies of the content of pPage and its siblings into aOld[].
5060 ** The rest of this function will use data from the copies rather
5061 ** that the original pages since the original pages will be in the
5062 ** process of being overwritten.
5064 for(i=0; i<nOld; i++){
5065 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5066 memcpy(p, apOld[i], sizeof(MemPage));
5067 p->aData = (void*)&p[1];
5068 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
5072 ** Load pointers to all cells on sibling pages and the divider cells
5073 ** into the local apCell[] array. Make copies of the divider cells
5074 ** into space obtained form aSpace1[] and remove the the divider Cells
5077 ** If the siblings are on leaf pages, then the child pointers of the
5078 ** divider cells are stripped from the cells before they are copied
5079 ** into aSpace1[]. In this way, all cells in apCell[] are without
5080 ** child pointers. If siblings are not leaves, then all cell in
5081 ** apCell[] include child pointers. Either way, all cells in apCell[]
5084 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5085 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
5088 leafCorrection = pPage->leaf*4;
5089 leafData = pPage->hasData;
5090 for(i=0; i<nOld; i++){
5091 MemPage *pOld = apCopy[i];
5092 int limit = pOld->nCell+pOld->nOverflow;
5093 for(j=0; j<limit; j++){
5094 assert( nCell<nMaxCells );
5095 apCell[nCell] = findOverflowCell(pOld, j);
5096 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
5100 for(a=0; a<pOld->nOverflow; a++){
5101 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5102 aFrom[nCell] = 0xFF;
5110 u16 sz = cellSizePtr(pParent, apDiv[i]);
5112 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5113 ** are duplicates of keys on the child pages. We need to remove
5114 ** the divider cells from pParent, but the dividers cells are not
5115 ** added to apCell[] because they are duplicates of child cells.
5117 dropCell(pParent, nxDiv, sz);
5120 assert( nCell<nMaxCells );
5122 pTemp = &aSpace1[iSpace1];
5124 assert( sz<=pBt->pageSize/4 );
5125 assert( iSpace1<=pBt->pageSize );
5126 memcpy(pTemp, apDiv[i], sz);
5127 apCell[nCell] = pTemp+leafCorrection;
5129 aFrom[nCell] = 0xFF;
5131 dropCell(pParent, nxDiv, sz);
5132 szCell[nCell] -= leafCorrection;
5133 assert( get4byte(pTemp)==pgnoOld[i] );
5135 assert( leafCorrection==0 );
5136 /* The right pointer of the child page pOld becomes the left
5137 ** pointer of the divider cell */
5138 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
5140 assert( leafCorrection==4 );
5141 if( szCell[nCell]<4 ){
5142 /* Do not allow any cells smaller than 4 bytes. */
5152 ** Figure out the number of pages needed to hold all nCell cells.
5153 ** Store this number in "k". Also compute szNew[] which is the total
5154 ** size of all cells on the i-th page and cntNew[] which is the index
5155 ** in apCell[] of the cell that divides page i from page i+1.
5156 ** cntNew[k] should equal nCell.
5158 ** Values computed by this block:
5160 ** k: The total number of sibling pages
5161 ** szNew[i]: Spaced used on the i-th sibling page.
5162 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5163 ** the right of the i-th sibling page.
5164 ** usableSpace: Number of bytes of space available on each sibling.
5167 usableSpace = pBt->usableSize - 12 + leafCorrection;
5168 for(subtotal=k=i=0; i<nCell; i++){
5169 assert( i<nMaxCells );
5170 subtotal += szCell[i] + 2;
5171 if( subtotal > usableSpace ){
5172 szNew[k] = subtotal - szCell[i];
5174 if( leafData ){ i--; }
5179 szNew[k] = subtotal;
5184 ** The packing computed by the previous block is biased toward the siblings
5185 ** on the left side. The left siblings are always nearly full, while the
5186 ** right-most sibling might be nearly empty. This block of code attempts
5187 ** to adjust the packing of siblings to get a better balance.
5189 ** This adjustment is more than an optimization. The packing above might
5190 ** be so out of balance as to be illegal. For example, the right-most
5191 ** sibling might be completely empty. This adjustment is not optional.
5193 for(i=k-1; i>0; i--){
5194 int szRight = szNew[i]; /* Size of sibling on the right */
5195 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5196 int r; /* Index of right-most cell in left sibling */
5197 int d; /* Index of first cell to the left of right sibling */
5199 r = cntNew[i-1] - 1;
5200 d = r + 1 - leafData;
5201 assert( d<nMaxCells );
5202 assert( r<nMaxCells );
5203 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5204 szRight += szCell[d] + 2;
5205 szLeft -= szCell[r] + 2;
5207 r = cntNew[i-1] - 1;
5208 d = r + 1 - leafData;
5211 szNew[i-1] = szLeft;
5214 /* Either we found one or more cells (cntnew[0])>0) or we are the
5215 ** a virtual root page. A virtual root page is when the real root
5216 ** page is page 1 and we are the only child of that page.
5218 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5221 ** Allocate k new pages. Reuse old pages where possible.
5223 assert( pPage->pgno>1 );
5224 pageFlags = pPage->aData[0];
5228 pNew = apNew[i] = apOld[i];
5229 pgnoNew[i] = pgnoOld[i];
5231 rc = sqlite3PagerWrite(pNew->pDbPage);
5233 if( rc ) goto balance_cleanup;
5236 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5237 if( rc ) goto balance_cleanup;
5243 /* Free any old pages that were not reused as new pages.
5246 rc = freePage(apOld[i]);
5247 if( rc ) goto balance_cleanup;
5248 releasePage(apOld[i]);
5254 ** Put the new pages in accending order. This helps to
5255 ** keep entries in the disk file in order so that a scan
5256 ** of the table is a linear scan through the file. That
5257 ** in turn helps the operating system to deliver pages
5258 ** from the disk more rapidly.
5260 ** An O(n^2) insertion sort algorithm is used, but since
5261 ** n is never more than NB (a small constant), that should
5262 ** not be a problem.
5264 ** When NB==3, this one optimization makes the database
5265 ** about 25% faster for large insertions and deletions.
5267 for(i=0; i<k-1; i++){
5268 int minV = pgnoNew[i];
5270 for(j=i+1; j<k; j++){
5271 if( pgnoNew[j]<(unsigned)minV ){
5281 pgnoNew[i] = pgnoNew[minI];
5282 apNew[i] = apNew[minI];
5287 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5289 nOld>=2 ? pgnoOld[1] : 0,
5290 nOld>=3 ? pgnoOld[2] : 0,
5291 pgnoNew[0], szNew[0],
5292 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5293 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5294 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5295 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5298 ** Evenly distribute the data in apCell[] across the new pages.
5299 ** Insert divider cells into pParent as necessary.
5302 for(i=0; i<nNew; i++){
5303 /* Assemble the new sibling page. */
5304 MemPage *pNew = apNew[i];
5305 assert( j<nMaxCells );
5306 assert( pNew->pgno==pgnoNew[i] );
5307 zeroPage(pNew, pageFlags);
5308 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5309 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5310 assert( pNew->nOverflow==0 );
5312 /* If this is an auto-vacuum database, update the pointer map entries
5313 ** that point to the siblings that were rearranged. These can be: left
5314 ** children of cells, the right-child of the page, or overflow pages
5315 ** pointed to by cells.
5318 for(k=j; k<cntNew[i]; k++){
5319 assert( k<nMaxCells );
5320 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5321 rc = ptrmapPutOvfl(pNew, k-j);
5322 if( rc==SQLITE_OK && leafCorrection==0 ){
5323 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5325 if( rc!=SQLITE_OK ){
5326 goto balance_cleanup;
5334 /* If the sibling page assembled above was not the right-most sibling,
5335 ** insert a divider cell into the parent page.
5337 if( i<nNew-1 && j<nCell ){
5342 assert( j<nMaxCells );
5344 sz = szCell[j] + leafCorrection;
5345 pTemp = &aSpace2[iSpace2];
5347 memcpy(&pNew->aData[8], pCell, 4);
5349 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5351 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5352 if( rc!=SQLITE_OK ){
5353 goto balance_cleanup;
5356 }else if( leafData ){
5357 /* If the tree is a leaf-data tree, and the siblings are leaves,
5358 ** then there is no divider cell in apCell[]. Instead, the divider
5359 ** cell consists of the integer key for the right-most cell of
5360 ** the sibling-page assembled above only.
5364 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5366 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5370 /* Obscure case for non-leaf-data trees: If the cell at pCell was
5371 ** previously stored on a leaf node, and its reported size was 4
5372 ** bytes, then it may actually be smaller than this
5373 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5374 ** any cell). But it is important to pass the correct size to
5375 ** insertCell(), so reparse the cell now.
5377 ** Note that this can never happen in an SQLite data file, as all
5378 ** cells are at least 4 bytes. It only happens in b-trees used
5379 ** to evaluate "IN (SELECT ...)" and similar clauses.
5382 assert(leafCorrection==4);
5383 sz = cellSizePtr(pParent, pCell);
5387 assert( sz<=pBt->pageSize/4 );
5388 assert( iSpace2<=pBt->pageSize );
5389 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5390 if( rc!=SQLITE_OK ) goto balance_cleanup;
5391 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5393 /* If this is an auto-vacuum database, and not a leaf-data tree,
5394 ** then update the pointer map with an entry for the overflow page
5395 ** that the cell just inserted points to (if any).
5397 if( ISAUTOVACUUM && !leafData ){
5398 rc = ptrmapPutOvfl(pParent, nxDiv);
5399 if( rc!=SQLITE_OK ){
5400 goto balance_cleanup;
5407 /* Set the pointer-map entry for the new sibling page. */
5409 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5410 if( rc!=SQLITE_OK ){
5411 goto balance_cleanup;
5418 if( (pageFlags & PTF_LEAF)==0 ){
5419 u8 *zChild = &apCopy[nOld-1]->aData[8];
5420 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
5422 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5423 if( rc!=SQLITE_OK ){
5424 goto balance_cleanup;
5428 if( nxDiv==pParent->nCell+pParent->nOverflow ){
5429 /* Right-most sibling is the right-most child of pParent */
5430 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5432 /* Right-most sibling is the left child of the first entry in pParent
5433 ** past the right-most divider entry */
5434 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5438 ** Balance the parent page. Note that the current page (pPage) might
5439 ** have been added to the freelist so it might no longer be initialized.
5440 ** But the parent page will always be initialized.
5442 assert( pParent->isInit );
5443 sqlite3ScratchFree(apCell);
5447 rc = balance(pCur, 0);
5450 ** Cleanup before returning.
5453 sqlite3PageFree(aSpace2);
5454 sqlite3ScratchFree(apCell);
5455 for(i=0; i<nOld; i++){
5456 releasePage(apOld[i]);
5458 for(i=0; i<nNew; i++){
5459 releasePage(apNew[i]);
5462 /* releasePage(pParent); */
5463 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5464 pPage->pgno, nOld, nNew, nCell));
5470 ** This routine is called for the root page of a btree when the root
5471 ** page contains no cells. This is an opportunity to make the tree
5472 ** shallower by one level.
5474 static int balance_shallower(BtCursor *pCur){
5475 MemPage *pPage; /* Root page of B-Tree */
5476 MemPage *pChild; /* The only child page of pPage */
5477 Pgno pgnoChild; /* Page number for pChild */
5478 int rc = SQLITE_OK; /* Return code from subprocedures */
5479 BtShared *pBt; /* The main BTree structure */
5480 int mxCellPerPage; /* Maximum number of cells per page */
5481 u8 **apCell; /* All cells from pages being balanced */
5482 u16 *szCell; /* Local size of all cells */
5484 assert( pCur->iPage==0 );
5485 pPage = pCur->apPage[0];
5487 assert( pPage->nCell==0 );
5488 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5490 mxCellPerPage = MX_CELL(pBt);
5491 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
5492 if( apCell==0 ) return SQLITE_NOMEM;
5493 szCell = (u16*)&apCell[mxCellPerPage];
5495 /* The table is completely empty */
5496 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5498 /* The root page is empty but has one child. Transfer the
5499 ** information from that one child into the root page if it
5500 ** will fit. This reduces the depth of the tree by one.
5502 ** If the root page is page 1, it has less space available than
5503 ** its child (due to the 100 byte header that occurs at the beginning
5504 ** of the database fle), so it might not be able to hold all of the
5505 ** information currently contained in the child. If this is the
5506 ** case, then do not do the transfer. Leave page 1 empty except
5507 ** for the right-pointer to the child page. The child page becomes
5508 ** the virtual root of the tree.
5510 VVA_ONLY( pCur->pagesShuffled = 1 );
5511 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5512 assert( pgnoChild>0 );
5513 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
5514 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5515 if( rc ) goto end_shallow_balance;
5516 if( pPage->pgno==1 ){
5517 rc = sqlite3BtreeInitPage(pChild);
5518 if( rc ) goto end_shallow_balance;
5519 assert( pChild->nOverflow==0 );
5520 if( pChild->nFree>=100 ){
5521 /* The child information will fit on the root page, so do the
5524 zeroPage(pPage, pChild->aData[0]);
5525 for(i=0; i<pChild->nCell; i++){
5526 apCell[i] = findCell(pChild,i);
5527 szCell[i] = cellSizePtr(pChild, apCell[i]);
5529 assemblePage(pPage, pChild->nCell, apCell, szCell);
5530 /* Copy the right-pointer of the child to the parent. */
5531 put4byte(&pPage->aData[pPage->hdrOffset+8],
5532 get4byte(&pChild->aData[pChild->hdrOffset+8]));
5534 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5536 /* The child has more information that will fit on the root.
5537 ** The tree is already balanced. Do nothing. */
5538 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5541 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5543 rc = sqlite3BtreeInitPage(pPage);
5544 assert( rc==SQLITE_OK );
5546 TRACE(("BALANCE: transfer child %d into root %d\n",
5547 pChild->pgno, pPage->pgno));
5549 assert( pPage->nOverflow==0 );
5551 rc = setChildPtrmaps(pPage);
5553 releasePage(pChild);
5555 end_shallow_balance:
5556 sqlite3_free(apCell);
5562 ** The root page is overfull
5564 ** When this happens, Create a new child page and copy the
5565 ** contents of the root into the child. Then make the root
5566 ** page an empty page with rightChild pointing to the new
5567 ** child. Finally, call balance_internal() on the new child
5568 ** to cause it to split.
5570 static int balance_deeper(BtCursor *pCur){
5571 int rc; /* Return value from subprocedures */
5572 MemPage *pPage; /* Pointer to the root page */
5573 MemPage *pChild; /* Pointer to a new child page */
5574 Pgno pgnoChild; /* Page number of the new child page */
5575 BtShared *pBt; /* The BTree */
5576 int usableSize; /* Total usable size of a page */
5577 u8 *data; /* Content of the parent page */
5578 u8 *cdata; /* Content of the child page */
5579 int hdr; /* Offset to page header in parent */
5580 int cbrk; /* Offset to content of first cell in parent */
5582 assert( pCur->iPage==0 );
5583 assert( pCur->apPage[0]->nOverflow>0 );
5585 VVA_ONLY( pCur->pagesShuffled = 1 );
5586 pPage = pCur->apPage[0];
5588 assert( sqlite3_mutex_held(pBt->mutex) );
5589 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5591 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5592 usableSize = pBt->usableSize;
5593 data = pPage->aData;
5594 hdr = pPage->hdrOffset;
5595 cbrk = get2byte(&data[hdr+5]);
5596 cdata = pChild->aData;
5597 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5598 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
5600 rc = sqlite3BtreeInitPage(pChild);
5601 if( rc==SQLITE_OK ){
5602 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5603 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5604 pChild->nOverflow = pPage->nOverflow;
5605 if( pChild->nOverflow ){
5608 assert( pChild->nCell==pPage->nCell );
5609 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5610 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5611 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5613 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5614 if( rc==SQLITE_OK ){
5615 rc = setChildPtrmaps(pChild);
5620 if( rc==SQLITE_OK ){
5622 pCur->apPage[1] = pChild;
5624 rc = balance_nonroot(pCur);
5626 releasePage(pChild);
5633 ** The page that pCur currently points to has just been modified in
5634 ** some way. This function figures out if this modification means the
5635 ** tree needs to be balanced, and if so calls the appropriate balancing
5638 ** Parameter isInsert is true if a new cell was just inserted into the
5639 ** page, or false otherwise.
5641 static int balance(BtCursor *pCur, int isInsert){
5643 MemPage *pPage = pCur->apPage[pCur->iPage];
5645 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5646 if( pCur->iPage==0 ){
5647 rc = sqlite3PagerWrite(pPage->pDbPage);
5648 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5649 rc = balance_deeper(pCur);
5651 if( rc==SQLITE_OK && pPage->nCell==0 ){
5652 rc = balance_shallower(pCur);
5655 if( pPage->nOverflow>0 ||
5656 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5657 rc = balance_nonroot(pCur);
5664 ** This routine checks all cursors that point to table pgnoRoot.
5665 ** If any of those cursors were opened with wrFlag==0 in a different
5666 ** database connection (a database connection that shares the pager
5667 ** cache with the current connection) and that other connection
5668 ** is not in the ReadUncommmitted state, then this routine returns
5671 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5672 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5673 ** blob cursors are used for both reading and writing.
5675 ** When pgnoRoot is the root page of an intkey table, this function is also
5676 ** responsible for invalidating incremental blob cursors when the table row
5677 ** on which they are opened is deleted or modified. Cursors are invalidated
5678 ** according to the following rules:
5680 ** 1) When BtreeClearTable() is called to completely delete the contents
5681 ** of a B-Tree table, pExclude is set to zero and parameter iRow is
5682 ** set to non-zero. In this case all incremental blob cursors open
5683 ** on the table rooted at pgnoRoot are invalidated.
5685 ** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5686 ** modify a table row via an SQL statement, pExclude is set to the
5687 ** write cursor used to do the modification and parameter iRow is set
5688 ** to the integer row id of the B-Tree entry being modified. Unless
5689 ** pExclude is itself an incremental blob cursor, then all incremental
5690 ** blob cursors open on row iRow of the B-Tree are invalidated.
5692 ** 3) If both pExclude and iRow are set to zero, no incremental blob
5693 ** cursors are invalidated.
5695 static int checkReadLocks(
5702 BtShared *pBt = pBtree->pBt;
5703 sqlite3 *db = pBtree->db;
5704 assert( sqlite3BtreeHoldsMutex(pBtree) );
5705 for(p=pBt->pCursor; p; p=p->pNext){
5706 if( p==pExclude ) continue;
5707 if( p->pgnoRoot!=pgnoRoot ) continue;
5708 #ifndef SQLITE_OMIT_INCRBLOB
5709 if( p->isIncrblobHandle && (
5711 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5713 p->eState = CURSOR_INVALID;
5716 if( p->eState!=CURSOR_VALID ) continue;
5718 #ifndef SQLITE_OMIT_INCRBLOB
5719 || p->isIncrblobHandle
5722 sqlite3 *dbOther = p->pBtree->db;
5724 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5725 return SQLITE_LOCKED;
5733 ** Insert a new record into the BTree. The key is given by (pKey,nKey)
5734 ** and the data is given by (pData,nData). The cursor is used only to
5735 ** define what table the record should be inserted into. The cursor
5736 ** is left pointing at a random location.
5738 ** For an INTKEY table, only the nKey value of the key is used. pKey is
5739 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
5741 int sqlite3BtreeInsert(
5742 BtCursor *pCur, /* Insert data into the table of this cursor */
5743 const void *pKey, i64 nKey, /* The key of the new record */
5744 const void *pData, int nData, /* The data of the new record */
5745 int nZero, /* Number of extra 0 bytes to append to data */
5746 int appendBias /* True if this is likely an append */
5753 Btree *p = pCur->pBtree;
5754 BtShared *pBt = p->pBt;
5755 unsigned char *oldCell;
5756 unsigned char *newCell = 0;
5758 assert( cursorHoldsMutex(pCur) );
5759 if( pBt->inTransaction!=TRANS_WRITE ){
5760 /* Must start a transaction before doing an insert */
5761 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5764 assert( !pBt->readOnly );
5765 if( !pCur->wrFlag ){
5766 return SQLITE_PERM; /* Cursor not open for writing */
5768 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
5769 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5771 if( pCur->eState==CURSOR_FAULT ){
5775 /* Save the positions of any other cursors open on this table */
5776 clearCursorPosition(pCur);
5778 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5779 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
5784 pPage = pCur->apPage[pCur->iPage];
5785 assert( pPage->intKey || nKey>=0 );
5786 assert( pPage->leaf || !pPage->intKey );
5787 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5788 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5789 loc==0 ? "overwrite" : "new entry"));
5790 assert( pPage->isInit );
5791 allocateTempSpace(pBt);
5792 newCell = pBt->pTmpSpace;
5793 if( newCell==0 ) return SQLITE_NOMEM;
5794 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5795 if( rc ) goto end_insert;
5796 assert( szNew==cellSizePtr(pPage, newCell) );
5797 assert( szNew<=MX_CELL_SIZE(pBt) );
5798 idx = pCur->aiIdx[pCur->iPage];
5799 if( loc==0 && CURSOR_VALID==pCur->eState ){
5801 assert( idx<pPage->nCell );
5802 rc = sqlite3PagerWrite(pPage->pDbPage);
5806 oldCell = findCell(pPage, idx);
5808 memcpy(newCell, oldCell, 4);
5810 szOld = cellSizePtr(pPage, oldCell);
5811 rc = clearCell(pPage, oldCell);
5812 if( rc ) goto end_insert;
5813 dropCell(pPage, idx, szOld);
5814 }else if( loc<0 && pPage->nCell>0 ){
5815 assert( pPage->leaf );
5816 idx = ++pCur->aiIdx[pCur->iPage];
5817 pCur->info.nSize = 0;
5818 pCur->validNKey = 0;
5820 assert( pPage->leaf );
5822 rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
5823 if( rc!=SQLITE_OK ) goto end_insert;
5824 rc = balance(pCur, 1);
5825 if( rc==SQLITE_OK ){
5833 ** Delete the entry that the cursor is pointing to. The cursor
5834 ** is left pointing at a arbitrary location.
5836 int sqlite3BtreeDelete(BtCursor *pCur){
5837 MemPage *pPage = pCur->apPage[pCur->iPage];
5839 unsigned char *pCell;
5842 Btree *p = pCur->pBtree;
5843 BtShared *pBt = p->pBt;
5845 assert( cursorHoldsMutex(pCur) );
5846 assert( pPage->isInit );
5847 if( pBt->inTransaction!=TRANS_WRITE ){
5848 /* Must start a transaction before doing a delete */
5849 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5852 assert( !pBt->readOnly );
5853 if( pCur->eState==CURSOR_FAULT ){
5856 if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
5857 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5859 if( !pCur->wrFlag ){
5860 return SQLITE_PERM; /* Did not open this cursor for writing */
5862 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
5863 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5866 /* Restore the current cursor position (a no-op if the cursor is not in
5867 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5868 ** open on the same table. Then call sqlite3PagerWrite() on the page
5869 ** that the entry will be deleted from.
5872 (rc = restoreCursorPosition(pCur))!=0 ||
5873 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5874 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5879 /* Locate the cell within its page and leave pCell pointing to the
5880 ** data. The clearCell() call frees any overflow pages associated with the
5881 ** cell. The cell itself is still intact.
5883 idx = pCur->aiIdx[pCur->iPage];
5884 pCell = findCell(pPage, idx);
5886 pgnoChild = get4byte(pCell);
5888 rc = clearCell(pPage, pCell);
5895 ** The entry we are about to delete is not a leaf so if we do not
5896 ** do something we will leave a hole on an internal page.
5897 ** We have to fill the hole by moving in a cell from a leaf. The
5898 ** next Cell after the one to be deleted is guaranteed to exist and
5899 ** to be a leaf so we can use it.
5904 unsigned char *pNext;
5906 unsigned char *tempCell = 0;
5907 assert( !pPage->intKey );
5908 sqlite3BtreeGetTempCursor(pCur, &leafCur);
5909 rc = sqlite3BtreeNext(&leafCur, ¬Used);
5910 if( rc==SQLITE_OK ){
5911 assert( leafCur.aiIdx[leafCur.iPage]==0 );
5912 pLeafPage = leafCur.apPage[leafCur.iPage];
5913 rc = sqlite3PagerWrite(pLeafPage->pDbPage);
5915 if( rc==SQLITE_OK ){
5916 int leafCursorInvalid = 0;
5918 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5919 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
5920 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
5921 pNext = findCell(pLeafPage, 0);
5922 szNext = cellSizePtr(pLeafPage, pNext);
5923 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5924 allocateTempSpace(pBt);
5925 tempCell = pBt->pTmpSpace;
5929 if( rc==SQLITE_OK ){
5930 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
5934 /* The "if" statement in the next code block is critical. The
5935 ** slightest error in that statement would allow SQLite to operate
5936 ** correctly most of the time but produce very rare failures. To
5937 ** guard against this, the following macros help to verify that
5938 ** the "if" statement is well tested.
5940 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
5941 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5942 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
5943 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5944 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
5945 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5946 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
5947 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5948 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
5949 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
5952 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
5953 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
5955 /* This branch is taken if the internal node is now either overflowing
5956 ** or underfull and the leaf node will be underfull after the just cell
5957 ** copied to the internal node is deleted from it. This is a special
5958 ** case because the call to balance() to correct the internal node
5959 ** may change the tree structure and invalidate the contents of
5960 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
5961 ** used by the balance() required to correct the underfull leaf
5964 ** The formula used in the expression above are based on facets of
5965 ** the SQLite file-format that do not change over time.
5967 testcase( pPage->nFree==pBt->usableSize*2/3+1 );
5968 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
5969 leafCursorInvalid = 1;
5972 if( rc==SQLITE_OK ){
5973 put4byte(findOverflowCell(pPage, idx), pgnoChild);
5974 VVA_ONLY( pCur->pagesShuffled = 0 );
5975 rc = balance(pCur, 0);
5978 if( rc==SQLITE_OK && leafCursorInvalid ){
5979 /* The leaf-node is now underfull and so the tree needs to be
5980 ** rebalanced. However, the balance() operation on the internal
5981 ** node above may have modified the structure of the B-Tree and
5982 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
5983 ** may not be trusted.
5985 ** It is not possible to copy the ancestry from pCur, as the same
5986 ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
5989 ** The call to saveCursorPosition() below internally saves the
5990 ** key that leafCur is currently pointing to. Currently, there
5991 ** are two copies of that key in the tree - one here on the leaf
5992 ** page and one on some internal node in the tree. The copy on
5993 ** the leaf node is always the next key in tree-order after the
5994 ** copy on the internal node. So, the call to sqlite3BtreeNext()
5995 ** calls restoreCursorPosition() to point the cursor to the copy
5996 ** stored on the internal node, then advances to the next entry,
5997 ** which happens to be the copy of the key on the internal node.
5998 ** Net effect: leafCur is pointing back to the duplicate cell
5999 ** that needs to be removed, and the leafCur.apPage[] and
6000 ** leafCur.aiIdx[] arrays are correct.
6002 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
6003 rc = saveCursorPosition(&leafCur);
6004 if( rc==SQLITE_OK ){
6005 rc = sqlite3BtreeNext(&leafCur, ¬Used);
6007 pLeafPage = leafCur.apPage[leafCur.iPage];
6008 assert( rc!=SQLITE_OK || pLeafPage->pgno==leafPgno );
6009 assert( rc!=SQLITE_OK || leafCur.aiIdx[leafCur.iPage]==0 );
6012 if( rc==SQLITE_OK ){
6013 dropCell(pLeafPage, 0, szNext);
6014 VVA_ONLY( leafCur.pagesShuffled = 0 );
6015 rc = balance(&leafCur, 0);
6016 assert( leafCursorInvalid || !leafCur.pagesShuffled
6017 || !pCur->pagesShuffled );
6020 sqlite3BtreeReleaseTempCursor(&leafCur);
6022 TRACE(("DELETE: table=%d delete from leaf %d\n",
6023 pCur->pgnoRoot, pPage->pgno));
6024 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6025 rc = balance(pCur, 0);
6027 if( rc==SQLITE_OK ){
6034 ** Create a new BTree table. Write into *piTable the page
6035 ** number for the root page of the new table.
6037 ** The type of type is determined by the flags parameter. Only the
6038 ** following values of flags are currently in use. Other values for
6039 ** flags might not work:
6041 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6042 ** BTREE_ZERODATA Used for SQL indices
6044 static int btreeCreateTable(Btree *p, int *piTable, int flags){
6045 BtShared *pBt = p->pBt;
6050 assert( sqlite3BtreeHoldsMutex(p) );
6051 if( pBt->inTransaction!=TRANS_WRITE ){
6052 /* Must start a transaction first */
6053 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6056 assert( !pBt->readOnly );
6058 #ifdef SQLITE_OMIT_AUTOVACUUM
6059 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6064 if( pBt->autoVacuum ){
6065 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6066 MemPage *pPageMove; /* The page to move to. */
6068 /* Creating a new table may probably require moving an existing database
6069 ** to make room for the new tables root page. In case this page turns
6070 ** out to be an overflow page, delete all overflow page-map caches
6071 ** held by open cursors.
6073 invalidateAllOverflowCache(pBt);
6075 /* Read the value of meta[3] from the database to determine where the
6076 ** root page of the new table should go. meta[3] is the largest root-page
6077 ** created so far, so the new root-page is (meta[3]+1).
6079 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
6080 if( rc!=SQLITE_OK ){
6085 /* The new root-page may not be allocated on a pointer-map page, or the
6086 ** PENDING_BYTE page.
6088 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6089 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6092 assert( pgnoRoot>=3 );
6094 /* Allocate a page. The page that currently resides at pgnoRoot will
6095 ** be moved to the allocated page (unless the allocated page happens
6096 ** to reside at pgnoRoot).
6098 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6099 if( rc!=SQLITE_OK ){
6103 if( pgnoMove!=pgnoRoot ){
6104 /* pgnoRoot is the page that will be used for the root-page of
6105 ** the new table (assuming an error did not occur). But we were
6106 ** allocated pgnoMove. If required (i.e. if it was not allocated
6107 ** by extending the file), the current page at position pgnoMove
6108 ** is already journaled.
6113 releasePage(pPageMove);
6115 /* Move the page currently at pgnoRoot to pgnoMove. */
6116 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6117 if( rc!=SQLITE_OK ){
6120 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6121 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6125 assert( eType!=PTRMAP_ROOTPAGE );
6126 assert( eType!=PTRMAP_FREEPAGE );
6127 rc = sqlite3PagerWrite(pRoot->pDbPage);
6128 if( rc!=SQLITE_OK ){
6132 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6135 /* Obtain the page at pgnoRoot */
6136 if( rc!=SQLITE_OK ){
6139 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6140 if( rc!=SQLITE_OK ){
6143 rc = sqlite3PagerWrite(pRoot->pDbPage);
6144 if( rc!=SQLITE_OK ){
6152 /* Update the pointer-map and meta-data with the new root-page number. */
6153 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6158 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
6165 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6169 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6170 zeroPage(pRoot, flags | PTF_LEAF);
6171 sqlite3PagerUnref(pRoot->pDbPage);
6172 *piTable = (int)pgnoRoot;
6175 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6177 sqlite3BtreeEnter(p);
6179 rc = btreeCreateTable(p, piTable, flags);
6180 sqlite3BtreeLeave(p);
6185 ** Erase the given database page and all its children. Return
6186 ** the page to the freelist.
6188 static int clearDatabasePage(
6189 BtShared *pBt, /* The BTree that contains the table */
6190 Pgno pgno, /* Page number to clear */
6191 MemPage *pParent, /* Parent page. NULL for the root */
6192 int freePageFlag /* Deallocate page if true */
6196 unsigned char *pCell;
6199 assert( sqlite3_mutex_held(pBt->mutex) );
6200 if( pgno>pagerPagecount(pBt->pPager) ){
6201 return SQLITE_CORRUPT_BKPT;
6204 rc = getAndInitPage(pBt, pgno, &pPage);
6205 if( rc ) goto cleardatabasepage_out;
6206 for(i=0; i<pPage->nCell; i++){
6207 pCell = findCell(pPage, i);
6209 rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
6210 if( rc ) goto cleardatabasepage_out;
6212 rc = clearCell(pPage, pCell);
6213 if( rc ) goto cleardatabasepage_out;
6216 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
6217 if( rc ) goto cleardatabasepage_out;
6220 rc = freePage(pPage);
6221 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6222 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6225 cleardatabasepage_out:
6231 ** Delete all information from a single table in the database. iTable is
6232 ** the page number of the root of the table. After this routine returns,
6233 ** the root page is empty, but still exists.
6235 ** This routine will fail with SQLITE_LOCKED if there are any open
6236 ** read cursors on the table. Open write cursors are moved to the
6237 ** root of the table.
6239 int sqlite3BtreeClearTable(Btree *p, int iTable){
6241 BtShared *pBt = p->pBt;
6242 sqlite3BtreeEnter(p);
6244 if( p->inTrans!=TRANS_WRITE ){
6245 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6246 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
6248 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6251 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
6253 sqlite3BtreeLeave(p);
6258 ** Erase all information in a table and add the root of the table to
6259 ** the freelist. Except, the root of the principle table (the one on
6260 ** page 1) is never added to the freelist.
6262 ** This routine will fail with SQLITE_LOCKED if there are any open
6263 ** cursors on the table.
6265 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6266 ** root page in the database file, then the last root page
6267 ** in the database file is moved into the slot formerly occupied by
6268 ** iTable and that last slot formerly occupied by the last root page
6269 ** is added to the freelist instead of iTable. In this say, all
6270 ** root pages are kept at the beginning of the database file, which
6271 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6272 ** page number that used to be the last root page in the file before
6273 ** the move. If no page gets moved, *piMoved is set to 0.
6274 ** The last root page is recorded in meta[3] and the value of
6275 ** meta[3] is updated by this procedure.
6277 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
6280 BtShared *pBt = p->pBt;
6282 assert( sqlite3BtreeHoldsMutex(p) );
6283 if( p->inTrans!=TRANS_WRITE ){
6284 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6287 /* It is illegal to drop a table if any cursors are open on the
6288 ** database. This is because in auto-vacuum mode the backend may
6289 ** need to move another root-page to fill a gap left by the deleted
6290 ** root page. If an open cursor was using this page a problem would
6294 return SQLITE_LOCKED;
6297 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6299 rc = sqlite3BtreeClearTable(p, iTable);
6308 #ifdef SQLITE_OMIT_AUTOVACUUM
6309 rc = freePage(pPage);
6312 if( pBt->autoVacuum ){
6314 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6315 if( rc!=SQLITE_OK ){
6320 if( iTable==maxRootPgno ){
6321 /* If the table being dropped is the table with the largest root-page
6322 ** number in the database, put the root page on the free list.
6324 rc = freePage(pPage);
6326 if( rc!=SQLITE_OK ){
6330 /* The table being dropped does not have the largest root-page
6331 ** number in the database. So move the page that does into the
6332 ** gap left by the deleted root-page.
6336 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6337 if( rc!=SQLITE_OK ){
6340 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
6342 if( rc!=SQLITE_OK ){
6345 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6346 if( rc!=SQLITE_OK ){
6349 rc = freePage(pMove);
6351 if( rc!=SQLITE_OK ){
6354 *piMoved = maxRootPgno;
6357 /* Set the new 'max-root-page' value in the database header. This
6358 ** is the old value less one, less one more if that happens to
6359 ** be a root-page number, less one again if that is the
6360 ** PENDING_BYTE_PAGE.
6363 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6366 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6369 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6371 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6373 rc = freePage(pPage);
6378 /* If sqlite3BtreeDropTable was called on page 1. */
6379 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6384 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6386 sqlite3BtreeEnter(p);
6388 rc = btreeDropTable(p, iTable, piMoved);
6389 sqlite3BtreeLeave(p);
6395 ** Read the meta-information out of a database file. Meta[0]
6396 ** is the number of free pages currently in the database. Meta[1]
6397 ** through meta[15] are available for use by higher layers. Meta[0]
6398 ** is read-only, the others are read/write.
6400 ** The schema layer numbers meta values differently. At the schema
6401 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6402 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
6404 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6408 BtShared *pBt = p->pBt;
6410 sqlite3BtreeEnter(p);
6413 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6414 ** the sqlite_master table. We grab this lock regardless of whether or
6415 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6416 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6418 rc = queryTableLock(p, 1, READ_LOCK);
6419 if( rc!=SQLITE_OK ){
6420 sqlite3BtreeLeave(p);
6424 assert( idx>=0 && idx<=15 );
6426 /* The b-tree is already holding a reference to page 1 of the database
6427 ** file. In this case the required meta-data value can be read directly
6428 ** from the page data of this reference. This is slightly faster than
6429 ** requesting a new reference from the pager layer.
6431 pP1 = (unsigned char *)pBt->pPage1->aData;
6433 /* The b-tree does not have a reference to page 1 of the database file.
6434 ** Obtain one from the pager layer.
6436 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6438 sqlite3BtreeLeave(p);
6441 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6443 *pMeta = get4byte(&pP1[36 + idx*4]);
6445 /* If the b-tree is not holding a reference to page 1, then one was
6446 ** requested from the pager layer in the above block. Release it now.
6449 sqlite3PagerUnref(pDbPage);
6452 /* If autovacuumed is disabled in this build but we are trying to
6453 ** access an autovacuumed database, then make the database readonly.
6455 #ifdef SQLITE_OMIT_AUTOVACUUM
6456 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6459 /* Grab the read-lock on page 1. */
6460 rc = lockTable(p, 1, READ_LOCK);
6461 sqlite3BtreeLeave(p);
6466 ** Write meta-information back into the database. Meta[0] is
6467 ** read-only and may not be written.
6469 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6470 BtShared *pBt = p->pBt;
6473 assert( idx>=1 && idx<=15 );
6474 sqlite3BtreeEnter(p);
6476 if( p->inTrans!=TRANS_WRITE ){
6477 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6479 assert( pBt->pPage1!=0 );
6480 pP1 = pBt->pPage1->aData;
6481 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6482 if( rc==SQLITE_OK ){
6483 put4byte(&pP1[36 + idx*4], iMeta);
6484 #ifndef SQLITE_OMIT_AUTOVACUUM
6486 assert( pBt->autoVacuum || iMeta==0 );
6487 assert( iMeta==0 || iMeta==1 );
6488 pBt->incrVacuum = iMeta;
6493 sqlite3BtreeLeave(p);
6498 ** Return the flag byte at the beginning of the page that the cursor
6499 ** is currently pointing to.
6501 int sqlite3BtreeFlags(BtCursor *pCur){
6502 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6503 ** restoreCursorPosition() here.
6506 restoreCursorPosition(pCur);
6507 pPage = pCur->apPage[pCur->iPage];
6508 assert( cursorHoldsMutex(pCur) );
6509 assert( pPage->pBt==pCur->pBt );
6510 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6515 ** Return the pager associated with a BTree. This routine is used for
6516 ** testing and debugging only.
6518 Pager *sqlite3BtreePager(Btree *p){
6519 return p->pBt->pPager;
6522 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6524 ** Append a message to the error message string.
6526 static void checkAppendMsg(
6527 IntegrityCk *pCheck,
6529 const char *zFormat,
6533 if( !pCheck->mxErr ) return;
6536 va_start(ap, zFormat);
6537 if( pCheck->errMsg.nChar ){
6538 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
6541 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6543 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6545 if( pCheck->errMsg.mallocFailed ){
6546 pCheck->mallocFailed = 1;
6549 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6551 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6553 ** Add 1 to the reference count for page iPage. If this is the second
6554 ** reference to the page, add an error message to pCheck->zErrMsg.
6555 ** Return 1 if there are 2 ore more references to the page and 0 if
6556 ** if this is the first reference to the page.
6558 ** Also check that the page number is in bounds.
6560 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6561 if( iPage==0 ) return 1;
6562 if( iPage>pCheck->nPage || iPage<0 ){
6563 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6566 if( pCheck->anRef[iPage]==1 ){
6567 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6570 return (pCheck->anRef[iPage]++)>1;
6573 #ifndef SQLITE_OMIT_AUTOVACUUM
6575 ** Check that the entry in the pointer-map for page iChild maps to
6576 ** page iParent, pointer type ptrType. If not, append an error message
6579 static void checkPtrmap(
6580 IntegrityCk *pCheck, /* Integrity check context */
6581 Pgno iChild, /* Child page number */
6582 u8 eType, /* Expected pointer map type */
6583 Pgno iParent, /* Expected pointer map parent page number */
6584 char *zContext /* Context description (used for error msg) */
6590 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6591 if( rc!=SQLITE_OK ){
6592 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6596 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6597 checkAppendMsg(pCheck, zContext,
6598 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6599 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6605 ** Check the integrity of the freelist or of an overflow page list.
6606 ** Verify that the number of pages on the list is N.
6608 static void checkList(
6609 IntegrityCk *pCheck, /* Integrity checking context */
6610 int isFreeList, /* True for a freelist. False for overflow page list */
6611 int iPage, /* Page number for first page in the list */
6612 int N, /* Expected number of pages in the list */
6613 char *zContext /* Context for error messages */
6618 while( N-- > 0 && pCheck->mxErr ){
6620 unsigned char *pOvflData;
6622 checkAppendMsg(pCheck, zContext,
6623 "%d of %d pages missing from overflow list starting at %d",
6624 N+1, expected, iFirst);
6627 if( checkRef(pCheck, iPage, zContext) ) break;
6628 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6629 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6632 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6634 int n = get4byte(&pOvflData[4]);
6635 #ifndef SQLITE_OMIT_AUTOVACUUM
6636 if( pCheck->pBt->autoVacuum ){
6637 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6640 if( n>pCheck->pBt->usableSize/4-2 ){
6641 checkAppendMsg(pCheck, zContext,
6642 "freelist leaf count too big on page %d", iPage);
6646 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6647 #ifndef SQLITE_OMIT_AUTOVACUUM
6648 if( pCheck->pBt->autoVacuum ){
6649 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6652 checkRef(pCheck, iFreePage, zContext);
6657 #ifndef SQLITE_OMIT_AUTOVACUUM
6659 /* If this database supports auto-vacuum and iPage is not the last
6660 ** page in this overflow list, check that the pointer-map entry for
6661 ** the following page matches iPage.
6663 if( pCheck->pBt->autoVacuum && N>0 ){
6664 i = get4byte(pOvflData);
6665 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6669 iPage = get4byte(pOvflData);
6670 sqlite3PagerUnref(pOvflPage);
6673 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6675 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6677 ** Do various sanity checks on a single page of a tree. Return
6678 ** the tree depth. Root pages return 0. Parents of root pages
6679 ** return 1, and so forth.
6681 ** These checks are done:
6683 ** 1. Make sure that cells and freeblocks do not overlap
6684 ** but combine to completely cover the page.
6685 ** NO 2. Make sure cell keys are in order.
6686 ** NO 3. Make sure no key is less than or equal to zLowerBound.
6687 ** NO 4. Make sure no key is greater than or equal to zUpperBound.
6688 ** 5. Check the integrity of overflow pages.
6689 ** 6. Recursively call checkTreePage on all children.
6690 ** 7. Verify that the depth of all children is the same.
6691 ** 8. Make sure this page is at least 33% full or else it is
6692 ** the root of the tree.
6694 static int checkTreePage(
6695 IntegrityCk *pCheck, /* Context for the sanity check */
6696 int iPage, /* Page number of the page to check */
6697 MemPage *pParent, /* Parent page */
6698 char *zParentContext /* Parent context */
6701 int i, rc, depth, d2, pgno, cnt;
6710 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6712 /* Check that the page exists
6715 usableSize = pBt->usableSize;
6716 if( iPage==0 ) return 0;
6717 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6718 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6719 checkAppendMsg(pCheck, zContext,
6720 "unable to get the page. error code=%d", rc);
6723 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
6724 checkAppendMsg(pCheck, zContext,
6725 "sqlite3BtreeInitPage() returns error code %d", rc);
6730 /* Check out all the cells.
6733 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6738 /* Check payload overflow pages
6740 sqlite3_snprintf(sizeof(zContext), zContext,
6741 "On tree page %d cell %d: ", iPage, i);
6742 pCell = findCell(pPage,i);
6743 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6745 if( !pPage->intKey ) sz += info.nKey;
6746 assert( sz==info.nPayload );
6747 if( sz>info.nLocal ){
6748 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6749 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6750 #ifndef SQLITE_OMIT_AUTOVACUUM
6751 if( pBt->autoVacuum ){
6752 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6755 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6758 /* Check sanity of left child page.
6761 pgno = get4byte(pCell);
6762 #ifndef SQLITE_OMIT_AUTOVACUUM
6763 if( pBt->autoVacuum ){
6764 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6767 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6768 if( i>0 && d2!=depth ){
6769 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6775 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6776 sqlite3_snprintf(sizeof(zContext), zContext,
6777 "On page %d at right child: ", iPage);
6778 #ifndef SQLITE_OMIT_AUTOVACUUM
6779 if( pBt->autoVacuum ){
6780 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6783 checkTreePage(pCheck, pgno, pPage, zContext);
6786 /* Check for complete coverage of the page
6788 data = pPage->aData;
6789 hdr = pPage->hdrOffset;
6790 hit = sqlite3PageMalloc( pBt->pageSize );
6792 pCheck->mallocFailed = 1;
6794 memset(hit, 0, usableSize );
6795 memset(hit, 1, get2byte(&data[hdr+5]));
6796 nCell = get2byte(&data[hdr+3]);
6797 cellStart = hdr + 12 - 4*pPage->leaf;
6798 for(i=0; i<nCell; i++){
6799 int pc = get2byte(&data[cellStart+i*2]);
6802 if( pc<=usableSize ){
6803 size = cellSizePtr(pPage, &data[pc]);
6805 if( (pc+size-1)>=usableSize || pc<0 ){
6806 checkAppendMsg(pCheck, 0,
6807 "Corruption detected in cell %d on page %d",i,iPage,0);
6809 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6812 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6814 int size = get2byte(&data[i+2]);
6816 if( (i+size-1)>=usableSize || i<0 ){
6817 checkAppendMsg(pCheck, 0,
6818 "Corruption detected in cell %d on page %d",i,iPage,0);
6820 for(j=i+size-1; j>=i; j--) hit[j]++;
6822 i = get2byte(&data[i]);
6824 for(i=cnt=0; i<usableSize; i++){
6827 }else if( hit[i]>1 ){
6828 checkAppendMsg(pCheck, 0,
6829 "Multiple uses for byte %d of page %d", i, iPage);
6833 if( cnt!=data[hdr+7] ){
6834 checkAppendMsg(pCheck, 0,
6835 "Fragmented space is %d byte reported as %d on page %d",
6836 cnt, data[hdr+7], iPage);
6839 sqlite3PageFree(hit);
6844 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6846 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6848 ** This routine does a complete check of the given BTree file. aRoot[] is
6849 ** an array of pages numbers were each page number is the root page of
6850 ** a table. nRoot is the number of entries in aRoot.
6852 ** Write the number of error seen in *pnErr. Except for some memory
6853 ** allocation errors, nn error message is held in memory obtained from
6854 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
6857 char *sqlite3BtreeIntegrityCheck(
6858 Btree *p, /* The btree to be checked */
6859 int *aRoot, /* An array of root pages numbers for individual trees */
6860 int nRoot, /* Number of entries in aRoot[] */
6861 int mxErr, /* Stop reporting errors after this many */
6862 int *pnErr /* Write number of errors seen to this variable */
6867 BtShared *pBt = p->pBt;
6870 sqlite3BtreeEnter(p);
6872 nRef = sqlite3PagerRefcount(pBt->pPager);
6873 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6875 sqlite3BtreeLeave(p);
6876 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
6879 sCheck.pPager = pBt->pPager;
6880 sCheck.nPage = pagerPagecount(sCheck.pPager);
6881 sCheck.mxErr = mxErr;
6883 sCheck.mallocFailed = 0;
6885 #ifndef SQLITE_OMIT_AUTOVACUUM
6886 if( pBt->nTrunc!=0 ){
6887 sCheck.nPage = pBt->nTrunc;
6890 if( sCheck.nPage==0 ){
6891 unlockBtreeIfUnused(pBt);
6892 sqlite3BtreeLeave(p);
6895 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6896 if( !sCheck.anRef ){
6897 unlockBtreeIfUnused(pBt);
6899 sqlite3BtreeLeave(p);
6902 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6903 i = PENDING_BYTE_PAGE(pBt);
6904 if( i<=sCheck.nPage ){
6905 sCheck.anRef[i] = 1;
6907 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
6909 /* Check the integrity of the freelist
6911 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6912 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6914 /* Check all the tables.
6916 for(i=0; i<nRoot && sCheck.mxErr; i++){
6917 if( aRoot[i]==0 ) continue;
6918 #ifndef SQLITE_OMIT_AUTOVACUUM
6919 if( pBt->autoVacuum && aRoot[i]>1 ){
6920 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6923 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6926 /* Make sure every page in the file is referenced
6928 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6929 #ifdef SQLITE_OMIT_AUTOVACUUM
6930 if( sCheck.anRef[i]==0 ){
6931 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6934 /* If the database supports auto-vacuum, make sure no tables contain
6935 ** references to pointer-map pages.
6937 if( sCheck.anRef[i]==0 &&
6938 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6939 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6941 if( sCheck.anRef[i]!=0 &&
6942 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6943 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6948 /* Make sure this analysis did not leave any unref() pages
6950 unlockBtreeIfUnused(pBt);
6951 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6952 checkAppendMsg(&sCheck, 0,
6953 "Outstanding page count goes from %d to %d during this analysis",
6954 nRef, sqlite3PagerRefcount(pBt->pPager)
6958 /* Clean up and report errors.
6960 sqlite3BtreeLeave(p);
6961 sqlite3_free(sCheck.anRef);
6962 if( sCheck.mallocFailed ){
6963 sqlite3StrAccumReset(&sCheck.errMsg);
6964 *pnErr = sCheck.nErr+1;
6967 *pnErr = sCheck.nErr;
6968 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
6969 return sqlite3StrAccumFinish(&sCheck.errMsg);
6971 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6974 ** Return the full pathname of the underlying database file.
6976 ** The pager filename is invariant as long as the pager is
6977 ** open so it is safe to access without the BtShared mutex.
6979 const char *sqlite3BtreeGetFilename(Btree *p){
6980 assert( p->pBt->pPager!=0 );
6981 return sqlite3PagerFilename(p->pBt->pPager);
6985 ** Return the pathname of the directory that contains the database file.
6987 ** The pager directory name is invariant as long as the pager is
6988 ** open so it is safe to access without the BtShared mutex.
6990 const char *sqlite3BtreeGetDirname(Btree *p){
6991 assert( p->pBt->pPager!=0 );
6992 return sqlite3PagerDirname(p->pBt->pPager);
6996 ** Return the pathname of the journal file for this database. The return
6997 ** value of this routine is the same regardless of whether the journal file
6998 ** has been created or not.
7000 ** The pager journal filename is invariant as long as the pager is
7001 ** open so it is safe to access without the BtShared mutex.
7003 const char *sqlite3BtreeGetJournalname(Btree *p){
7004 assert( p->pBt->pPager!=0 );
7005 return sqlite3PagerJournalname(p->pBt->pPager);
7008 #ifndef SQLITE_OMIT_VACUUM
7010 ** Copy the complete content of pBtFrom into pBtTo. A transaction
7011 ** must be active for both files.
7013 ** The size of file pTo may be reduced by this operation.
7014 ** If anything goes wrong, the transaction on pTo is rolled back.
7016 ** If successful, CommitPhaseOne() may be called on pTo before returning.
7017 ** The caller should finish committing the transaction on pTo by calling
7018 ** sqlite3BtreeCommit().
7020 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
7024 Pgno nFromPage; /* Number of pages in pFrom */
7025 Pgno nToPage; /* Number of pages in pTo */
7026 Pgno nNewPage; /* Number of pages in pTo after the copy */
7028 Pgno iSkip; /* Pending byte page in pTo */
7029 int nToPageSize; /* Page size of pTo in bytes */
7030 int nFromPageSize; /* Page size of pFrom in bytes */
7032 BtShared *pBtTo = pTo->pBt;
7033 BtShared *pBtFrom = pFrom->pBt;
7034 pBtTo->db = pTo->db;
7035 pBtFrom->db = pFrom->db;
7037 nToPageSize = pBtTo->pageSize;
7038 nFromPageSize = pBtFrom->pageSize;
7040 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
7041 return SQLITE_ERROR;
7043 if( pBtTo->pCursor ){
7047 nToPage = pagerPagecount(pBtTo->pPager);
7048 nFromPage = pagerPagecount(pBtFrom->pPager);
7049 iSkip = PENDING_BYTE_PAGE(pBtTo);
7051 /* Variable nNewPage is the number of pages required to store the
7052 ** contents of pFrom using the current page-size of pTo.
7054 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
7057 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7059 /* Journal the original page.
7061 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7062 ** in database *pTo (before the copy). This page is never written
7063 ** into the journal file. Unless i==iSkip or the page was not
7064 ** present in pTo before the copy operation, journal page i from pTo.
7066 if( i!=iSkip && i<=nToPage ){
7067 DbPage *pDbPage = 0;
7068 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
7069 if( rc==SQLITE_OK ){
7070 rc = sqlite3PagerWrite(pDbPage);
7071 if( rc==SQLITE_OK && i>nFromPage ){
7072 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7073 ** that is because the names of those procedures do not exactly
7074 ** represent what they do. Write() really means "put this page in the
7075 ** rollback journal and mark it as dirty so that it will be written
7076 ** to the database file later." DontWrite() undoes the second part of
7077 ** that and prevents the page from being written to the database. The
7078 ** page is still on the rollback journal, though. And that is the
7079 ** whole point of this block: to put pages on the rollback journal.
7081 rc = sqlite3PagerDontWrite(pDbPage);
7083 sqlite3PagerUnref(pDbPage);
7087 /* Overwrite the data in page i of the target database */
7088 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7090 DbPage *pToPage = 0;
7093 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7094 if( rc==SQLITE_OK ){
7095 rc = sqlite3PagerWrite(pToPage);
7099 iOff=(i-1)*nToPageSize;
7100 rc==SQLITE_OK && iOff<i*nToPageSize;
7101 iOff += nFromPageSize
7103 DbPage *pFromPage = 0;
7104 Pgno iFrom = (iOff/nFromPageSize)+1;
7106 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7110 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7111 if( rc==SQLITE_OK ){
7112 char *zTo = sqlite3PagerGetData(pToPage);
7113 char *zFrom = sqlite3PagerGetData(pFromPage);
7116 if( nFromPageSize>=nToPageSize ){
7117 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7118 nCopy = nToPageSize;
7120 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7121 nCopy = nFromPageSize;
7124 memcpy(zTo, zFrom, nCopy);
7125 sqlite3PagerUnref(pFromPage);
7130 MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
7132 sqlite3PagerUnref(pToPage);
7137 /* If things have worked so far, the database file may need to be
7138 ** truncated. The complex part is that it may need to be truncated to
7139 ** a size that is not an integer multiple of nToPageSize - the current
7140 ** page size used by the pager associated with B-Tree pTo.
7142 ** For example, say the page-size of pTo is 2048 bytes and the original
7143 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7144 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7146 if( rc==SQLITE_OK ){
7147 if( nFromPageSize!=nToPageSize ){
7148 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7149 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7150 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7151 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7153 assert( iSize<=iNow );
7155 /* Commit phase one syncs the journal file associated with pTo
7156 ** containing the original data. It does not sync the database file
7157 ** itself. After doing this it is safe to use OsTruncate() and other
7158 ** file APIs on the database file directly.
7160 pBtTo->db = pTo->db;
7161 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7162 if( iSize<iNow && rc==SQLITE_OK ){
7163 rc = sqlite3OsTruncate(pFile, iSize);
7166 /* The loop that copied data from database pFrom to pTo did not
7167 ** populate the locking page of database pTo. If the page-size of
7168 ** pFrom is smaller than that of pTo, this means some data will
7169 ** not have been copied.
7171 ** This block copies the missing data from database pFrom to pTo
7172 ** using file APIs. This is safe because at this point we know that
7173 ** all of the original data from pTo has been synced into the
7174 ** journal file. At this point it would be safe to do anything at
7175 ** all to the database file except truncate it to zero bytes.
7177 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7181 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7182 iOff += nFromPageSize
7184 DbPage *pFromPage = 0;
7185 Pgno iFrom = (iOff/nFromPageSize)+1;
7187 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7191 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7192 if( rc==SQLITE_OK ){
7193 char *zFrom = sqlite3PagerGetData(pFromPage);
7194 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
7195 sqlite3PagerUnref(pFromPage);
7200 /* Sync the database file */
7201 if( rc==SQLITE_OK ){
7202 rc = sqlite3PagerSync(pBtTo->pPager);
7205 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7207 if( rc==SQLITE_OK ){
7208 pBtTo->pageSizeFixed = 0;
7213 sqlite3BtreeRollback(pTo);
7218 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7220 sqlite3BtreeEnter(pTo);
7221 sqlite3BtreeEnter(pFrom);
7222 rc = btreeCopyFile(pTo, pFrom);
7223 sqlite3BtreeLeave(pFrom);
7224 sqlite3BtreeLeave(pTo);
7228 #endif /* SQLITE_OMIT_VACUUM */
7231 ** Return non-zero if a transaction is active.
7233 int sqlite3BtreeIsInTrans(Btree *p){
7234 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7235 return (p && (p->inTrans==TRANS_WRITE));
7239 ** Return non-zero if a statement transaction is active.
7241 int sqlite3BtreeIsInStmt(Btree *p){
7242 assert( sqlite3BtreeHoldsMutex(p) );
7243 return (p->pBt && p->pBt->inStmt);
7247 ** Return non-zero if a read (or write) transaction is active.
7249 int sqlite3BtreeIsInReadTrans(Btree *p){
7250 assert( sqlite3_mutex_held(p->db->mutex) );
7251 return (p && (p->inTrans!=TRANS_NONE));
7255 ** This function returns a pointer to a blob of memory associated with
7256 ** a single shared-btree. The memory is used by client code for its own
7257 ** purposes (for example, to store a high-level schema associated with
7258 ** the shared-btree). The btree layer manages reference counting issues.
7260 ** The first time this is called on a shared-btree, nBytes bytes of memory
7261 ** are allocated, zeroed, and returned to the caller. For each subsequent
7262 ** call the nBytes parameter is ignored and a pointer to the same blob
7263 ** of memory returned.
7265 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7266 ** allocated, a null pointer is returned. If the blob has already been
7267 ** allocated, it is returned as normal.
7269 ** Just before the shared-btree is closed, the function passed as the
7270 ** xFree argument when the memory allocation was made is invoked on the
7271 ** blob of allocated memory. This function should not call sqlite3_free()
7272 ** on the memory, the btree layer does that.
7274 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7275 BtShared *pBt = p->pBt;
7276 sqlite3BtreeEnter(p);
7277 if( !pBt->pSchema && nBytes ){
7278 pBt->pSchema = sqlite3MallocZero(nBytes);
7279 pBt->xFreeSchema = xFree;
7281 sqlite3BtreeLeave(p);
7282 return pBt->pSchema;
7286 ** Return true if another user of the same shared btree as the argument
7287 ** handle holds an exclusive lock on the sqlite_master table.
7289 int sqlite3BtreeSchemaLocked(Btree *p){
7291 assert( sqlite3_mutex_held(p->db->mutex) );
7292 sqlite3BtreeEnter(p);
7293 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7294 sqlite3BtreeLeave(p);
7299 #ifndef SQLITE_OMIT_SHARED_CACHE
7301 ** Obtain a lock on the table whose root page is iTab. The
7302 ** lock is a write lock if isWritelock is true or a read lock
7305 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7308 u8 lockType = READ_LOCK + isWriteLock;
7309 assert( READ_LOCK+1==WRITE_LOCK );
7310 assert( isWriteLock==0 || isWriteLock==1 );
7311 sqlite3BtreeEnter(p);
7312 rc = queryTableLock(p, iTab, lockType);
7313 if( rc==SQLITE_OK ){
7314 rc = lockTable(p, iTab, lockType);
7316 sqlite3BtreeLeave(p);
7322 #ifndef SQLITE_OMIT_INCRBLOB
7324 ** Argument pCsr must be a cursor opened for writing on an
7325 ** INTKEY table currently pointing at a valid table entry.
7326 ** This function modifies the data stored as part of that entry.
7327 ** Only the data content may only be modified, it is not possible
7328 ** to change the length of the data stored.
7330 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7331 assert( cursorHoldsMutex(pCsr) );
7332 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7333 assert(pCsr->isIncrblobHandle);
7335 restoreCursorPosition(pCsr);
7336 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7337 if( pCsr->eState!=CURSOR_VALID ){
7338 return SQLITE_ABORT;
7341 /* Check some preconditions:
7342 ** (a) the cursor is open for writing,
7343 ** (b) there is no read-lock on the table being modified and
7344 ** (c) the cursor points at a valid row of an intKey table.
7346 if( !pCsr->wrFlag ){
7347 return SQLITE_READONLY;
7349 assert( !pCsr->pBt->readOnly
7350 && pCsr->pBt->inTransaction==TRANS_WRITE );
7351 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
7352 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7354 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
7355 return SQLITE_ERROR;
7358 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
7362 ** Set a flag on this cursor to cache the locations of pages from the
7363 ** overflow list for the current row. This is used by cursors opened
7364 ** for incremental blob IO only.
7366 ** This function sets a flag only. The actual page location cache
7367 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7368 ** accessPayload() (the worker function for sqlite3BtreeData() and
7369 ** sqlite3BtreePutData()).
7371 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7372 assert( cursorHoldsMutex(pCur) );
7373 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7374 assert(!pCur->isIncrblobHandle);
7375 assert(!pCur->aOverflow);
7376 pCur->isIncrblobHandle = 1;