Update contrib.
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** $Id: btree.c,v 1.525 2008/10/08 17:58:49 danielk1977 Exp $
14 ** This file implements a external (disk-based) database using BTrees.
15 ** See the header comment on "btreeInt.h" for additional information.
16 ** Including a description of file format and an overview of operation.
21 ** The header string that appears at the beginning of every
24 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
27 ** Set this global variable to 1 to enable tracing using the TRACE
31 int sqlite3BtreeTrace=0; /* True to enable tracing */
32 # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);}
38 ** Sometimes we need a small amount of code such as a variable initialization
39 ** to setup for a later assert() statement. We do not want this code to
40 ** appear when assert() is disabled. The following macro is therefore
41 ** used to contain that setup code. The "VVA" acronym stands for
42 ** "Verification, Validation, and Accreditation". In other words, the
43 ** code within VVA_ONLY() will only run during verification processes.
46 # define VVA_ONLY(X) X
53 #ifndef SQLITE_OMIT_SHARED_CACHE
55 ** A list of BtShared objects that are eligible for participation
56 ** in shared cache. This variable has file scope during normal builds,
57 ** but the test harness needs to access it so we make it global for
61 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
63 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
65 #endif /* SQLITE_OMIT_SHARED_CACHE */
67 #ifndef SQLITE_OMIT_SHARED_CACHE
69 ** Enable or disable the shared pager and schema features.
71 ** This routine has no effect on existing database connections.
72 ** The shared cache setting effects only future calls to
73 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
75 int sqlite3_enable_shared_cache(int enable){
76 sqlite3GlobalConfig.sharedCacheEnabled = enable;
83 ** Forward declaration
85 static int checkReadLocks(Btree*, Pgno, BtCursor*, i64);
88 #ifdef SQLITE_OMIT_SHARED_CACHE
90 ** The functions queryTableLock(), lockTable() and unlockAllTables()
91 ** manipulate entries in the BtShared.pLock linked list used to store
92 ** shared-cache table level locks. If the library is compiled with the
93 ** shared-cache feature disabled, then there is only ever one user
94 ** of each BtShared structure and so this locking is not necessary.
95 ** So define the lock related functions as no-ops.
97 #define queryTableLock(a,b,c) SQLITE_OK
98 #define lockTable(a,b,c) SQLITE_OK
99 #define unlockAllTables(a)
102 #ifndef SQLITE_OMIT_SHARED_CACHE
104 ** Query to see if btree handle p may obtain a lock of type eLock
105 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
106 ** SQLITE_OK if the lock may be obtained (by calling lockTable()), or
107 ** SQLITE_LOCKED if not.
109 static int queryTableLock(Btree *p, Pgno iTab, u8 eLock){
110 BtShared *pBt = p->pBt;
113 assert( sqlite3BtreeHoldsMutex(p) );
114 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
117 /* This is a no-op if the shared-cache is not enabled */
122 /* If some other connection is holding an exclusive lock, the
123 ** requested lock may not be obtained.
125 if( pBt->pExclusive && pBt->pExclusive!=p ){
126 return SQLITE_LOCKED;
129 /* This (along with lockTable()) is where the ReadUncommitted flag is
130 ** dealt with. If the caller is querying for a read-lock and the flag is
131 ** set, it is unconditionally granted - even if there are write-locks
132 ** on the table. If a write-lock is requested, the ReadUncommitted flag
133 ** is not considered.
135 ** In function lockTable(), if a read-lock is demanded and the
136 ** ReadUncommitted flag is set, no entry is added to the locks list
139 ** To summarize: If the ReadUncommitted flag is set, then read cursors do
140 ** not create or respect table locks. The locking procedure for a
141 ** write-cursor does not change.
144 0==(p->db->flags&SQLITE_ReadUncommitted) ||
148 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
149 if( pIter->pBtree!=p && pIter->iTable==iTab &&
150 (pIter->eLock!=eLock || eLock!=READ_LOCK) ){
151 return SQLITE_LOCKED;
157 #endif /* !SQLITE_OMIT_SHARED_CACHE */
159 #ifndef SQLITE_OMIT_SHARED_CACHE
161 ** Add a lock on the table with root-page iTable to the shared-btree used
162 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
165 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_BUSY and
166 ** SQLITE_NOMEM may also be returned.
168 static int lockTable(Btree *p, Pgno iTable, u8 eLock){
169 BtShared *pBt = p->pBt;
173 assert( sqlite3BtreeHoldsMutex(p) );
174 assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
177 /* This is a no-op if the shared-cache is not enabled */
182 assert( SQLITE_OK==queryTableLock(p, iTable, eLock) );
184 /* If the read-uncommitted flag is set and a read-lock is requested,
185 ** return early without adding an entry to the BtShared.pLock list. See
186 ** comment in function queryTableLock() for more info on handling
187 ** the ReadUncommitted flag.
190 (p->db->flags&SQLITE_ReadUncommitted) &&
191 (eLock==READ_LOCK) &&
197 /* First search the list for an existing lock on this table. */
198 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
199 if( pIter->iTable==iTable && pIter->pBtree==p ){
205 /* If the above search did not find a BtLock struct associating Btree p
206 ** with table iTable, allocate one and link it into the list.
209 pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
213 pLock->iTable = iTable;
215 pLock->pNext = pBt->pLock;
219 /* Set the BtLock.eLock variable to the maximum of the current lock
220 ** and the requested lock. This means if a write-lock was already held
221 ** and a read-lock requested, we don't incorrectly downgrade the lock.
223 assert( WRITE_LOCK>READ_LOCK );
224 if( eLock>pLock->eLock ){
225 pLock->eLock = eLock;
230 #endif /* !SQLITE_OMIT_SHARED_CACHE */
232 #ifndef SQLITE_OMIT_SHARED_CACHE
234 ** Release all the table locks (locks obtained via calls to the lockTable()
235 ** procedure) held by Btree handle p.
237 static void unlockAllTables(Btree *p){
238 BtShared *pBt = p->pBt;
239 BtLock **ppIter = &pBt->pLock;
241 assert( sqlite3BtreeHoldsMutex(p) );
242 assert( p->sharable || 0==*ppIter );
245 BtLock *pLock = *ppIter;
246 assert( pBt->pExclusive==0 || pBt->pExclusive==pLock->pBtree );
247 if( pLock->pBtree==p ){
248 *ppIter = pLock->pNext;
251 ppIter = &pLock->pNext;
255 if( pBt->pExclusive==p ){
259 #endif /* SQLITE_OMIT_SHARED_CACHE */
261 static void releasePage(MemPage *pPage); /* Forward reference */
264 ** Verify that the cursor holds a mutex on the BtShared
267 static int cursorHoldsMutex(BtCursor *p){
268 return sqlite3_mutex_held(p->pBt->mutex);
273 #ifndef SQLITE_OMIT_INCRBLOB
275 ** Invalidate the overflow page-list cache for cursor pCur, if any.
277 static void invalidateOverflowCache(BtCursor *pCur){
278 assert( cursorHoldsMutex(pCur) );
279 sqlite3_free(pCur->aOverflow);
284 ** Invalidate the overflow page-list cache for all cursors opened
285 ** on the shared btree structure pBt.
287 static void invalidateAllOverflowCache(BtShared *pBt){
289 assert( sqlite3_mutex_held(pBt->mutex) );
290 for(p=pBt->pCursor; p; p=p->pNext){
291 invalidateOverflowCache(p);
295 #define invalidateOverflowCache(x)
296 #define invalidateAllOverflowCache(x)
300 ** Save the current cursor position in the variables BtCursor.nKey
301 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
303 static int saveCursorPosition(BtCursor *pCur){
306 assert( CURSOR_VALID==pCur->eState );
307 assert( 0==pCur->pKey );
308 assert( cursorHoldsMutex(pCur) );
310 rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
312 /* If this is an intKey table, then the above call to BtreeKeySize()
313 ** stores the integer key in pCur->nKey. In this case this value is
314 ** all that is required. Otherwise, if pCur is not open on an intKey
315 ** table, then malloc space for and store the pCur->nKey bytes of key
318 if( rc==SQLITE_OK && 0==pCur->apPage[0]->intKey){
319 void *pKey = sqlite3Malloc(pCur->nKey);
321 rc = sqlite3BtreeKey(pCur, 0, pCur->nKey, pKey);
331 assert( !pCur->apPage[0]->intKey || !pCur->pKey );
335 for(i=0; i<=pCur->iPage; i++){
336 releasePage(pCur->apPage[i]);
340 pCur->eState = CURSOR_REQUIRESEEK;
343 invalidateOverflowCache(pCur);
348 ** Save the positions of all cursors except pExcept open on the table
349 ** with root-page iRoot. Usually, this is called just before cursor
350 ** pExcept is used to modify the table (BtreeDelete() or BtreeInsert()).
352 static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
354 assert( sqlite3_mutex_held(pBt->mutex) );
355 assert( pExcept==0 || pExcept->pBt==pBt );
356 for(p=pBt->pCursor; p; p=p->pNext){
357 if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) &&
358 p->eState==CURSOR_VALID ){
359 int rc = saveCursorPosition(p);
369 ** Clear the current cursor position.
371 void sqlite3BtreeClearCursor(BtCursor *pCur){
372 assert( cursorHoldsMutex(pCur) );
373 sqlite3_free(pCur->pKey);
375 pCur->eState = CURSOR_INVALID;
379 ** Restore the cursor to the position it was in (or as close to as possible)
380 ** when saveCursorPosition() was called. Note that this call deletes the
381 ** saved position info stored by saveCursorPosition(), so there can be
382 ** at most one effective restoreCursorPosition() call after each
383 ** saveCursorPosition().
385 int sqlite3BtreeRestoreCursorPosition(BtCursor *pCur){
387 assert( cursorHoldsMutex(pCur) );
388 assert( pCur->eState>=CURSOR_REQUIRESEEK );
389 if( pCur->eState==CURSOR_FAULT ){
392 pCur->eState = CURSOR_INVALID;
393 rc = sqlite3BtreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skip);
395 sqlite3_free(pCur->pKey);
397 assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
402 #define restoreCursorPosition(p) \
403 (p->eState>=CURSOR_REQUIRESEEK ? \
404 sqlite3BtreeRestoreCursorPosition(p) : \
408 ** Determine whether or not a cursor has moved from the position it
409 ** was last placed at. Cursor can move when the row they are pointing
410 ** at is deleted out from under them.
412 ** This routine returns an error code if something goes wrong. The
413 ** integer *pHasMoved is set to one if the cursor has moved and 0 if not.
415 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved){
418 rc = restoreCursorPosition(pCur);
423 if( pCur->eState!=CURSOR_VALID || pCur->skip!=0 ){
431 #ifndef SQLITE_OMIT_AUTOVACUUM
433 ** Given a page number of a regular database page, return the page
434 ** number for the pointer-map page that contains the entry for the
435 ** input page number.
437 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
438 int nPagesPerMapPage, iPtrMap, ret;
439 assert( sqlite3_mutex_held(pBt->mutex) );
440 nPagesPerMapPage = (pBt->usableSize/5)+1;
441 iPtrMap = (pgno-2)/nPagesPerMapPage;
442 ret = (iPtrMap*nPagesPerMapPage) + 2;
443 if( ret==PENDING_BYTE_PAGE(pBt) ){
450 ** Write an entry into the pointer map.
452 ** This routine updates the pointer map entry for page number 'key'
453 ** so that it maps to type 'eType' and parent page number 'pgno'.
454 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
456 static int ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent){
457 DbPage *pDbPage; /* The pointer map page */
458 u8 *pPtrmap; /* The pointer map data */
459 Pgno iPtrmap; /* The pointer map page number */
460 int offset; /* Offset in pointer map page */
463 assert( sqlite3_mutex_held(pBt->mutex) );
464 /* The master-journal page number must never be used as a pointer map page */
465 assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
467 assert( pBt->autoVacuum );
469 return SQLITE_CORRUPT_BKPT;
471 iPtrmap = PTRMAP_PAGENO(pBt, key);
472 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
476 offset = PTRMAP_PTROFFSET(iPtrmap, key);
477 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
479 if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
480 TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
481 rc = sqlite3PagerWrite(pDbPage);
483 pPtrmap[offset] = eType;
484 put4byte(&pPtrmap[offset+1], parent);
488 sqlite3PagerUnref(pDbPage);
493 ** Read an entry from the pointer map.
495 ** This routine retrieves the pointer map entry for page 'key', writing
496 ** the type and parent page number to *pEType and *pPgno respectively.
497 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
499 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
500 DbPage *pDbPage; /* The pointer map page */
501 int iPtrmap; /* Pointer map page index */
502 u8 *pPtrmap; /* Pointer map page data */
503 int offset; /* Offset of entry in pointer map */
506 assert( sqlite3_mutex_held(pBt->mutex) );
508 iPtrmap = PTRMAP_PAGENO(pBt, key);
509 rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
513 pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
515 offset = PTRMAP_PTROFFSET(iPtrmap, key);
517 *pEType = pPtrmap[offset];
518 if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
520 sqlite3PagerUnref(pDbPage);
521 if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
525 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
526 #define ptrmapPut(w,x,y,z) SQLITE_OK
527 #define ptrmapGet(w,x,y,z) SQLITE_OK
528 #define ptrmapPutOvfl(y,z) SQLITE_OK
532 ** Given a btree page and a cell index (0 means the first cell on
533 ** the page, 1 means the second cell, and so forth) return a pointer
534 ** to the cell content.
536 ** This routine works only for pages that do not contain overflow cells.
538 #define findCell(P,I) \
539 ((P)->aData + ((P)->maskPage & get2byte(&(P)->aData[(P)->cellOffset+2*(I)])))
542 ** This a more complex version of findCell() that works for
543 ** pages that do contain overflow cells. See insert
545 static u8 *findOverflowCell(MemPage *pPage, int iCell){
547 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
548 for(i=pPage->nOverflow-1; i>=0; i--){
550 struct _OvflCell *pOvfl;
551 pOvfl = &pPage->aOvfl[i];
560 return findCell(pPage, iCell);
564 ** Parse a cell content block and fill in the CellInfo structure. There
565 ** are two versions of this function. sqlite3BtreeParseCell() takes a
566 ** cell index as the second argument and sqlite3BtreeParseCellPtr()
567 ** takes a pointer to the body of the cell as its second argument.
569 ** Within this file, the parseCell() macro can be called instead of
570 ** sqlite3BtreeParseCellPtr(). Using some compilers, this will be faster.
572 void sqlite3BtreeParseCellPtr(
573 MemPage *pPage, /* Page containing the cell */
574 u8 *pCell, /* Pointer to the cell text. */
575 CellInfo *pInfo /* Fill in this structure */
577 int n; /* Number bytes in cell content header */
578 u32 nPayload; /* Number of bytes of cell payload */
580 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
582 pInfo->pCell = pCell;
583 assert( pPage->leaf==0 || pPage->leaf==1 );
584 n = pPage->childPtrSize;
585 assert( n==4-4*pPage->leaf );
587 if( pPage->hasData ){
588 n += getVarint32(&pCell[n], nPayload);
592 n += getVarint(&pCell[n], (u64*)&pInfo->nKey);
593 pInfo->nData = nPayload;
596 n += getVarint32(&pCell[n], nPayload);
597 pInfo->nKey = nPayload;
599 pInfo->nPayload = nPayload;
601 if( likely(nPayload<=pPage->maxLocal) ){
602 /* This is the (easy) common case where the entire payload fits
603 ** on the local page. No overflow is required.
605 int nSize; /* Total size of cell content in bytes */
606 nSize = nPayload + n;
607 pInfo->nLocal = nPayload;
608 pInfo->iOverflow = 0;
609 if( (nSize & ~3)==0 ){
610 nSize = 4; /* Minimum cell size is 4 */
612 pInfo->nSize = nSize;
614 /* If the payload will not fit completely on the local page, we have
615 ** to decide how much to store locally and how much to spill onto
616 ** overflow pages. The strategy is to minimize the amount of unused
617 ** space on overflow pages while keeping the amount of local storage
618 ** in between minLocal and maxLocal.
620 ** Warning: changing the way overflow payload is distributed in any
621 ** way will result in an incompatible file format.
623 int minLocal; /* Minimum amount of payload held locally */
624 int maxLocal; /* Maximum amount of payload held locally */
625 int surplus; /* Overflow payload available for local storage */
627 minLocal = pPage->minLocal;
628 maxLocal = pPage->maxLocal;
629 surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
630 if( surplus <= maxLocal ){
631 pInfo->nLocal = surplus;
633 pInfo->nLocal = minLocal;
635 pInfo->iOverflow = pInfo->nLocal + n;
636 pInfo->nSize = pInfo->iOverflow + 4;
639 #define parseCell(pPage, iCell, pInfo) \
640 sqlite3BtreeParseCellPtr((pPage), findCell((pPage), (iCell)), (pInfo))
641 void sqlite3BtreeParseCell(
642 MemPage *pPage, /* Page containing the cell */
643 int iCell, /* The cell index. First cell is 0 */
644 CellInfo *pInfo /* Fill in this structure */
646 parseCell(pPage, iCell, pInfo);
650 ** Compute the total number of bytes that a Cell needs in the cell
651 ** data area of the btree-page. The return number includes the cell
652 ** data header and the local payload, but not any overflow page or
653 ** the space used by the cell pointer.
656 static u16 cellSize(MemPage *pPage, int iCell){
658 sqlite3BtreeParseCell(pPage, iCell, &info);
662 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
664 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
668 #ifndef SQLITE_OMIT_AUTOVACUUM
670 ** If the cell pCell, part of page pPage contains a pointer
671 ** to an overflow page, insert an entry into the pointer-map
672 ** for the overflow page.
674 static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){
677 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
678 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
679 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
680 Pgno ovfl = get4byte(&pCell[info.iOverflow]);
681 return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno);
686 ** If the cell with index iCell on page pPage contains a pointer
687 ** to an overflow page, insert an entry into the pointer-map
688 ** for the overflow page.
690 static int ptrmapPutOvfl(MemPage *pPage, int iCell){
692 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
693 pCell = findOverflowCell(pPage, iCell);
694 return ptrmapPutOvflPtr(pPage, pCell);
700 ** Defragment the page given. All Cells are moved to the
701 ** end of the page and all free space is collected into one
702 ** big FreeBlk that occurs in between the header and cell
703 ** pointer array and the cell content area.
705 static int defragmentPage(MemPage *pPage){
706 int i; /* Loop counter */
707 int pc; /* Address of a i-th cell */
708 int addr; /* Offset of first byte after cell pointer array */
709 int hdr; /* Offset to the page header */
710 int size; /* Size of a cell */
711 int usableSize; /* Number of usable bytes on a page */
712 int cellOffset; /* Offset to the cell pointer array */
713 int cbrk; /* Offset to the cell content area */
714 int nCell; /* Number of cells on the page */
715 unsigned char *data; /* The page data */
716 unsigned char *temp; /* Temp area for cell content */
718 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
719 assert( pPage->pBt!=0 );
720 assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
721 assert( pPage->nOverflow==0 );
722 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
723 temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
725 hdr = pPage->hdrOffset;
726 cellOffset = pPage->cellOffset;
727 nCell = pPage->nCell;
728 assert( nCell==get2byte(&data[hdr+3]) );
729 usableSize = pPage->pBt->usableSize;
730 cbrk = get2byte(&data[hdr+5]);
731 memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
733 for(i=0; i<nCell; i++){
734 u8 *pAddr; /* The i-th cell pointer */
735 pAddr = &data[cellOffset + i*2];
736 pc = get2byte(pAddr);
737 if( pc>=usableSize ){
738 return SQLITE_CORRUPT_BKPT;
740 size = cellSizePtr(pPage, &temp[pc]);
742 if( cbrk<cellOffset+2*nCell || pc+size>usableSize ){
743 return SQLITE_CORRUPT_BKPT;
745 assert( cbrk+size<=usableSize && cbrk>=0 );
746 memcpy(&data[cbrk], &temp[pc], size);
747 put2byte(pAddr, cbrk);
749 assert( cbrk>=cellOffset+2*nCell );
750 put2byte(&data[hdr+5], cbrk);
754 addr = cellOffset+2*nCell;
755 memset(&data[addr], 0, cbrk-addr);
756 if( cbrk-addr!=pPage->nFree ){
757 return SQLITE_CORRUPT_BKPT;
763 ** Allocate nByte bytes of space on a page.
765 ** Return the index into pPage->aData[] of the first byte of
766 ** the new allocation. The caller guarantees that there is enough
767 ** space. This routine will never fail.
769 ** If the page contains nBytes of free space but does not contain
770 ** nBytes of contiguous free space, then this routine automatically
771 ** calls defragementPage() to consolidate all free space before
772 ** allocating the new chunk.
774 static int allocateSpace(MemPage *pPage, int nByte){
784 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
785 assert( pPage->pBt );
786 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
787 assert( nByte>=0 ); /* Minimum cell size is 4 */
788 assert( pPage->nFree>=nByte );
789 assert( pPage->nOverflow==0 );
790 pPage->nFree -= nByte;
791 hdr = pPage->hdrOffset;
795 /* Search the freelist looking for a slot big enough to satisfy the
798 while( (pc = get2byte(&data[addr]))>0 ){
799 size = get2byte(&data[pc+2]);
802 memcpy(&data[addr], &data[pc], 2);
803 data[hdr+7] = nFrag + size - nByte;
806 put2byte(&data[pc+2], size-nByte);
807 return pc + size - nByte;
814 /* Allocate memory from the gap in between the cell pointer array
815 ** and the cell content area.
817 top = get2byte(&data[hdr+5]);
818 nCell = get2byte(&data[hdr+3]);
819 cellOffset = pPage->cellOffset;
820 if( nFrag>=60 || cellOffset + 2*nCell > top - nByte ){
821 defragmentPage(pPage);
822 top = get2byte(&data[hdr+5]);
825 assert( cellOffset + 2*nCell <= top );
826 put2byte(&data[hdr+5], top);
831 ** Return a section of the pPage->aData to the freelist.
832 ** The first byte of the new free block is pPage->aDisk[start]
833 ** and the size of the block is "size" bytes.
835 ** Most of the effort here is involved in coalesing adjacent
836 ** free blocks into a single big free block.
838 static int freeSpace(MemPage *pPage, int start, int size){
839 int addr, pbegin, hdr;
840 unsigned char *data = pPage->aData;
842 assert( pPage->pBt!=0 );
843 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
844 assert( start>=pPage->hdrOffset+6+(pPage->leaf?0:4) );
845 assert( (start + size)<=pPage->pBt->usableSize );
846 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
847 assert( size>=0 ); /* Minimum cell size is 4 */
849 #ifdef SQLITE_SECURE_DELETE
850 /* Overwrite deleted information with zeros when the SECURE_DELETE
851 ** option is enabled at compile-time */
852 memset(&data[start], 0, size);
855 /* Add the space back into the linked list of freeblocks */
856 hdr = pPage->hdrOffset;
858 while( (pbegin = get2byte(&data[addr]))<start && pbegin>0 ){
859 assert( pbegin<=pPage->pBt->usableSize-4 );
860 if( pbegin<=addr ) return SQLITE_CORRUPT_BKPT;
863 if( pbegin>pPage->pBt->usableSize-4 ) return SQLITE_CORRUPT_BKPT;
864 assert( pbegin>addr || pbegin==0 );
865 put2byte(&data[addr], start);
866 put2byte(&data[start], pbegin);
867 put2byte(&data[start+2], size);
868 pPage->nFree += size;
870 /* Coalesce adjacent free blocks */
871 addr = pPage->hdrOffset + 1;
872 while( (pbegin = get2byte(&data[addr]))>0 ){
874 assert( pbegin>addr );
875 assert( pbegin<=pPage->pBt->usableSize-4 );
876 pnext = get2byte(&data[pbegin]);
877 psize = get2byte(&data[pbegin+2]);
878 if( pbegin + psize + 3 >= pnext && pnext>0 ){
879 int frag = pnext - (pbegin+psize);
880 if( frag<0 || frag>data[pPage->hdrOffset+7] ) return SQLITE_CORRUPT_BKPT;
881 data[pPage->hdrOffset+7] -= frag;
882 put2byte(&data[pbegin], get2byte(&data[pnext]));
883 put2byte(&data[pbegin+2], pnext+get2byte(&data[pnext+2])-pbegin);
889 /* If the cell content area begins with a freeblock, remove it. */
890 if( data[hdr+1]==data[hdr+5] && data[hdr+2]==data[hdr+6] ){
892 pbegin = get2byte(&data[hdr+1]);
893 memcpy(&data[hdr+1], &data[pbegin], 2);
894 top = get2byte(&data[hdr+5]);
895 put2byte(&data[hdr+5], top + get2byte(&data[pbegin+2]));
901 ** Decode the flags byte (the first byte of the header) for a page
902 ** and initialize fields of the MemPage structure accordingly.
904 ** Only the following combinations are supported. Anything different
905 ** indicates a corrupt database files:
908 ** PTF_ZERODATA | PTF_LEAF
909 ** PTF_LEAFDATA | PTF_INTKEY
910 ** PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
912 static int decodeFlags(MemPage *pPage, int flagByte){
913 BtShared *pBt; /* A copy of pPage->pBt */
915 assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
916 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
917 pPage->leaf = flagByte>>3; assert( PTF_LEAF == 1<<3 );
918 flagByte &= ~PTF_LEAF;
919 pPage->childPtrSize = 4-4*pPage->leaf;
921 if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
923 pPage->hasData = pPage->leaf;
924 pPage->maxLocal = pBt->maxLeaf;
925 pPage->minLocal = pBt->minLeaf;
926 }else if( flagByte==PTF_ZERODATA ){
929 pPage->maxLocal = pBt->maxLocal;
930 pPage->minLocal = pBt->minLocal;
932 return SQLITE_CORRUPT_BKPT;
938 ** Initialize the auxiliary information for a disk block.
940 ** Return SQLITE_OK on success. If we see that the page does
941 ** not contain a well-formed database page, then return
942 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not
943 ** guarantee that the page is well-formed. It only shows that
944 ** we failed to detect any corruption.
946 int sqlite3BtreeInitPage(MemPage *pPage){
948 assert( pPage->pBt!=0 );
949 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
950 assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
951 assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
952 assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
954 if( !pPage->isInit ){
955 int pc; /* Address of a freeblock within pPage->aData[] */
956 int hdr; /* Offset to beginning of page header */
957 u8 *data; /* Equal to pPage->aData */
958 BtShared *pBt; /* The main btree structure */
959 int usableSize; /* Amount of usable space on each page */
960 int cellOffset; /* Offset from start of page to first cell pointer */
961 int nFree; /* Number of unused bytes on the page */
962 int top; /* First byte of the cell content area */
966 hdr = pPage->hdrOffset;
968 if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
969 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
970 pPage->maskPage = pBt->pageSize - 1;
971 pPage->nOverflow = 0;
972 usableSize = pBt->usableSize;
973 pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
974 top = get2byte(&data[hdr+5]);
975 pPage->nCell = get2byte(&data[hdr+3]);
976 if( pPage->nCell>MX_CELL(pBt) ){
977 /* To many cells for a single page. The page must be corrupt */
978 return SQLITE_CORRUPT_BKPT;
981 /* Compute the total free space on the page */
982 pc = get2byte(&data[hdr+1]);
983 nFree = data[hdr+7] + top - (cellOffset + 2*pPage->nCell);
986 if( pc>usableSize-4 ){
987 /* Free block is off the page */
988 return SQLITE_CORRUPT_BKPT;
990 next = get2byte(&data[pc]);
991 size = get2byte(&data[pc+2]);
992 if( next>0 && next<=pc+size+3 ){
993 /* Free blocks must be in accending order */
994 return SQLITE_CORRUPT_BKPT;
999 pPage->nFree = nFree;
1000 if( nFree>=usableSize ){
1001 /* Free space cannot exceed total page size */
1002 return SQLITE_CORRUPT_BKPT;
1006 /* Check that all the offsets in the cell offset array are within range.
1008 ** Omitting this consistency check and using the pPage->maskPage mask
1009 ** to prevent overrunning the page buffer in findCell() results in a
1010 ** 2.5% performance gain.
1013 u8 *pOff; /* Iterator used to check all cell offsets are in range */
1014 u8 *pEnd; /* Pointer to end of cell offset array */
1015 u8 mask; /* Mask of bits that must be zero in MSB of cell offsets */
1016 mask = ~(((u8)(pBt->pageSize>>8))-1);
1017 pEnd = &data[cellOffset + pPage->nCell*2];
1018 for(pOff=&data[cellOffset]; pOff!=pEnd && !((*pOff)&mask); pOff+=2);
1020 return SQLITE_CORRUPT_BKPT;
1031 ** Set up a raw page so that it looks like a database page holding
1034 static void zeroPage(MemPage *pPage, int flags){
1035 unsigned char *data = pPage->aData;
1036 BtShared *pBt = pPage->pBt;
1037 int hdr = pPage->hdrOffset;
1040 assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
1041 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1042 assert( sqlite3PagerGetData(pPage->pDbPage) == data );
1043 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1044 assert( sqlite3_mutex_held(pBt->mutex) );
1045 /*memset(&data[hdr], 0, pBt->usableSize - hdr);*/
1047 first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
1048 memset(&data[hdr+1], 0, 4);
1050 put2byte(&data[hdr+5], pBt->usableSize);
1051 pPage->nFree = pBt->usableSize - first;
1052 decodeFlags(pPage, flags);
1053 pPage->hdrOffset = hdr;
1054 pPage->cellOffset = first;
1055 pPage->nOverflow = 0;
1056 assert( pBt->pageSize>=512 && pBt->pageSize<=32768 );
1057 pPage->maskPage = pBt->pageSize - 1;
1064 ** Convert a DbPage obtained from the pager into a MemPage used by
1067 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
1068 MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
1069 pPage->aData = sqlite3PagerGetData(pDbPage);
1070 pPage->pDbPage = pDbPage;
1073 pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
1078 ** Get a page from the pager. Initialize the MemPage.pBt and
1079 ** MemPage.aData elements if needed.
1081 ** If the noContent flag is set, it means that we do not care about
1082 ** the content of the page at this time. So do not go to the disk
1083 ** to fetch the content. Just fill in the content with zeros for now.
1084 ** If in the future we call sqlite3PagerWrite() on this page, that
1085 ** means we have started to be concerned about content and the disk
1086 ** read should occur at that point.
1088 int sqlite3BtreeGetPage(
1089 BtShared *pBt, /* The btree */
1090 Pgno pgno, /* Number of the page to fetch */
1091 MemPage **ppPage, /* Return the page in this parameter */
1092 int noContent /* Do not load page content if true */
1097 assert( sqlite3_mutex_held(pBt->mutex) );
1098 rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, noContent);
1100 *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1105 ** Return the size of the database file in pages. Or return -1 if
1106 ** there is any kind of error.
1108 static int pagerPagecount(Pager *pPager){
1111 rc = sqlite3PagerPagecount(pPager, &nPage);
1112 return (rc==SQLITE_OK?nPage:-1);
1116 ** Get a page from the pager and initialize it. This routine
1117 ** is just a convenience wrapper around separate calls to
1118 ** sqlite3BtreeGetPage() and sqlite3BtreeInitPage().
1120 static int getAndInitPage(
1121 BtShared *pBt, /* The database file */
1122 Pgno pgno, /* Number of the page to get */
1123 MemPage **ppPage /* Write the page pointer here */
1129 assert( sqlite3_mutex_held(pBt->mutex) );
1131 return SQLITE_CORRUPT_BKPT;
1134 /* It is often the case that the page we want is already in cache.
1135 ** If so, get it directly. This saves us from having to call
1136 ** pagerPagecount() to make sure pgno is within limits, which results
1137 ** in a measureable performance improvements.
1139 pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
1141 /* Page is already in cache */
1142 *ppPage = pPage = btreePageFromDbPage(pDbPage, pgno, pBt);
1145 /* Page not in cache. Acquire it. */
1146 if( pgno>pagerPagecount(pBt->pPager) ){
1147 return SQLITE_CORRUPT_BKPT;
1149 rc = sqlite3BtreeGetPage(pBt, pgno, ppPage, 0);
1153 if( !pPage->isInit ){
1154 rc = sqlite3BtreeInitPage(pPage);
1156 if( rc!=SQLITE_OK ){
1164 ** Release a MemPage. This should be called once for each prior
1165 ** call to sqlite3BtreeGetPage.
1167 static void releasePage(MemPage *pPage){
1169 assert( pPage->aData );
1170 assert( pPage->pBt );
1171 assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
1172 assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
1173 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1174 sqlite3PagerUnref(pPage->pDbPage);
1179 ** During a rollback, when the pager reloads information into the cache
1180 ** so that the cache is restored to its original state at the start of
1181 ** the transaction, for each page restored this routine is called.
1183 ** This routine needs to reset the extra data section at the end of the
1184 ** page to agree with the restored data.
1186 static void pageReinit(DbPage *pData){
1188 pPage = (MemPage *)sqlite3PagerGetExtra(pData);
1189 if( pPage->isInit ){
1190 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1192 if( sqlite3PagerPageRefcount(pData)>0 ){
1193 sqlite3BtreeInitPage(pPage);
1199 ** Invoke the busy handler for a btree.
1201 static int sqlite3BtreeInvokeBusyHandler(void *pArg, int n){
1202 BtShared *pBt = (BtShared*)pArg;
1204 assert( sqlite3_mutex_held(pBt->db->mutex) );
1205 return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
1209 ** Open a database file.
1211 ** zFilename is the name of the database file. If zFilename is NULL
1212 ** a new database with a random name is created. This randomly named
1213 ** database file will be deleted when sqlite3BtreeClose() is called.
1214 ** If zFilename is ":memory:" then an in-memory database is created
1215 ** that is automatically destroyed when it is closed.
1217 int sqlite3BtreeOpen(
1218 const char *zFilename, /* Name of the file containing the BTree database */
1219 sqlite3 *db, /* Associated database handle */
1220 Btree **ppBtree, /* Pointer to new Btree object written here */
1221 int flags, /* Options */
1222 int vfsFlags /* Flags passed through to sqlite3_vfs.xOpen() */
1224 sqlite3_vfs *pVfs; /* The VFS to use for this btree */
1225 BtShared *pBt = 0; /* Shared part of btree structure */
1226 Btree *p; /* Handle to return */
1229 unsigned char zDbHeader[100];
1231 /* Set the variable isMemdb to true for an in-memory database, or
1232 ** false for a file-based database. This symbol is only required if
1233 ** either of the shared-data or autovacuum features are compiled
1234 ** into the library.
1236 #if !defined(SQLITE_OMIT_SHARED_CACHE) || !defined(SQLITE_OMIT_AUTOVACUUM)
1237 #ifdef SQLITE_OMIT_MEMORYDB
1238 const int isMemdb = 0;
1240 const int isMemdb = zFilename && !strcmp(zFilename, ":memory:");
1245 assert( sqlite3_mutex_held(db->mutex) );
1248 p = sqlite3MallocZero(sizeof(Btree));
1250 return SQLITE_NOMEM;
1252 p->inTrans = TRANS_NONE;
1255 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1257 ** If this Btree is a candidate for shared cache, try to find an
1258 ** existing BtShared object that we can share with
1261 && (db->flags & SQLITE_Vtab)==0
1262 && zFilename && zFilename[0]
1264 if( sqlite3GlobalConfig.sharedCacheEnabled ){
1265 int nFullPathname = pVfs->mxPathname+1;
1266 char *zFullPathname = sqlite3Malloc(nFullPathname);
1267 sqlite3_mutex *mutexShared;
1269 db->flags |= SQLITE_SharedCache;
1270 if( !zFullPathname ){
1272 return SQLITE_NOMEM;
1274 sqlite3OsFullPathname(pVfs, zFilename, nFullPathname, zFullPathname);
1275 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1276 sqlite3_mutex_enter(mutexShared);
1277 for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
1278 assert( pBt->nRef>0 );
1279 if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager))
1280 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
1286 sqlite3_mutex_leave(mutexShared);
1287 sqlite3_free(zFullPathname);
1291 /* In debug mode, we mark all persistent databases as sharable
1292 ** even when they are not. This exercises the locking code and
1293 ** gives more opportunity for asserts(sqlite3_mutex_held())
1294 ** statements to find locking problems.
1303 ** The following asserts make sure that structures used by the btree are
1304 ** the right size. This is to guard against size changes that result
1305 ** when compiling on a different architecture.
1307 assert( sizeof(i64)==8 || sizeof(i64)==4 );
1308 assert( sizeof(u64)==8 || sizeof(u64)==4 );
1309 assert( sizeof(u32)==4 );
1310 assert( sizeof(u16)==2 );
1311 assert( sizeof(Pgno)==4 );
1313 pBt = sqlite3MallocZero( sizeof(*pBt) );
1316 goto btree_open_out;
1318 pBt->busyHdr.xFunc = sqlite3BtreeInvokeBusyHandler;
1319 pBt->busyHdr.pArg = pBt;
1320 rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
1321 EXTRA_SIZE, flags, vfsFlags);
1322 if( rc==SQLITE_OK ){
1323 rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
1325 if( rc!=SQLITE_OK ){
1326 goto btree_open_out;
1328 sqlite3PagerSetBusyhandler(pBt->pPager, &pBt->busyHdr);
1331 sqlite3PagerSetReiniter(pBt->pPager, pageReinit);
1334 pBt->readOnly = sqlite3PagerIsreadonly(pBt->pPager);
1335 pBt->pageSize = get2byte(&zDbHeader[16]);
1336 if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
1337 || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
1339 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1340 #ifndef SQLITE_OMIT_AUTOVACUUM
1341 /* If the magic name ":memory:" will create an in-memory database, then
1342 ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
1343 ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
1344 ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
1345 ** regular file-name. In this case the auto-vacuum applies as per normal.
1347 if( zFilename && !isMemdb ){
1348 pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
1349 pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
1354 nReserve = zDbHeader[20];
1355 pBt->pageSizeFixed = 1;
1356 #ifndef SQLITE_OMIT_AUTOVACUUM
1357 pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
1358 pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
1361 pBt->usableSize = pBt->pageSize - nReserve;
1362 assert( (pBt->pageSize & 7)==0 ); /* 8-byte alignment of pageSize */
1363 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1365 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1366 /* Add the new BtShared object to the linked list sharable BtShareds.
1369 sqlite3_mutex *mutexShared;
1371 mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1372 if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
1373 pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
1374 if( pBt->mutex==0 ){
1376 db->mallocFailed = 0;
1377 goto btree_open_out;
1380 sqlite3_mutex_enter(mutexShared);
1381 pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
1382 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
1383 sqlite3_mutex_leave(mutexShared);
1388 #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
1389 /* If the new Btree uses a sharable pBtShared, then link the new
1390 ** Btree into the list of all sharable Btrees for the same connection.
1391 ** The list is kept in ascending order by pBt address.
1396 for(i=0; i<db->nDb; i++){
1397 if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
1398 while( pSib->pPrev ){ pSib = pSib->pPrev; }
1399 if( p->pBt<pSib->pBt ){
1404 while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
1407 p->pNext = pSib->pNext;
1410 p->pNext->pPrev = p;
1422 if( rc!=SQLITE_OK ){
1423 if( pBt && pBt->pPager ){
1424 sqlite3PagerClose(pBt->pPager);
1434 ** Decrement the BtShared.nRef counter. When it reaches zero,
1435 ** remove the BtShared structure from the sharing list. Return
1436 ** true if the BtShared.nRef counter reaches zero and return
1437 ** false if it is still positive.
1439 static int removeFromSharingList(BtShared *pBt){
1440 #ifndef SQLITE_OMIT_SHARED_CACHE
1441 sqlite3_mutex *pMaster;
1445 assert( sqlite3_mutex_notheld(pBt->mutex) );
1446 pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
1447 sqlite3_mutex_enter(pMaster);
1450 if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
1451 GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
1453 pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
1454 while( ALWAYS(pList) && pList->pNext!=pBt ){
1457 if( ALWAYS(pList) ){
1458 pList->pNext = pBt->pNext;
1461 if( SQLITE_THREADSAFE ){
1462 sqlite3_mutex_free(pBt->mutex);
1466 sqlite3_mutex_leave(pMaster);
1474 ** Make sure pBt->pTmpSpace points to an allocation of
1475 ** MX_CELL_SIZE(pBt) bytes.
1477 static void allocateTempSpace(BtShared *pBt){
1478 if( !pBt->pTmpSpace ){
1479 pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
1484 ** Free the pBt->pTmpSpace allocation
1486 static void freeTempSpace(BtShared *pBt){
1487 sqlite3PageFree( pBt->pTmpSpace);
1492 ** Close an open database and invalidate all cursors.
1494 int sqlite3BtreeClose(Btree *p){
1495 BtShared *pBt = p->pBt;
1498 /* Close all cursors opened via this handle. */
1499 assert( sqlite3_mutex_held(p->db->mutex) );
1500 sqlite3BtreeEnter(p);
1502 pCur = pBt->pCursor;
1504 BtCursor *pTmp = pCur;
1506 if( pTmp->pBtree==p ){
1507 sqlite3BtreeCloseCursor(pTmp);
1511 /* Rollback any active transaction and free the handle structure.
1512 ** The call to sqlite3BtreeRollback() drops any table-locks held by
1515 sqlite3BtreeRollback(p);
1516 sqlite3BtreeLeave(p);
1518 /* If there are still other outstanding references to the shared-btree
1519 ** structure, return now. The remainder of this procedure cleans
1520 ** up the shared-btree.
1522 assert( p->wantToLock==0 && p->locked==0 );
1523 if( !p->sharable || removeFromSharingList(pBt) ){
1524 /* The pBt is no longer on the sharing list, so we can access
1525 ** it without having to hold the mutex.
1527 ** Clean out and delete the BtShared object.
1529 assert( !pBt->pCursor );
1530 sqlite3PagerClose(pBt->pPager);
1531 if( pBt->xFreeSchema && pBt->pSchema ){
1532 pBt->xFreeSchema(pBt->pSchema);
1534 sqlite3_free(pBt->pSchema);
1539 #ifndef SQLITE_OMIT_SHARED_CACHE
1540 assert( p->wantToLock==0 );
1541 assert( p->locked==0 );
1542 if( p->pPrev ) p->pPrev->pNext = p->pNext;
1543 if( p->pNext ) p->pNext->pPrev = p->pPrev;
1551 ** Change the limit on the number of pages allowed in the cache.
1553 ** The maximum number of cache pages is set to the absolute
1554 ** value of mxPage. If mxPage is negative, the pager will
1555 ** operate asynchronously - it will not stop to do fsync()s
1556 ** to insure data is written to the disk surface before
1557 ** continuing. Transactions still work if synchronous is off,
1558 ** and the database cannot be corrupted if this program
1559 ** crashes. But if the operating system crashes or there is
1560 ** an abrupt power failure when synchronous is off, the database
1561 ** could be left in an inconsistent and unrecoverable state.
1562 ** Synchronous is on by default so database corruption is not
1563 ** normally a worry.
1565 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
1566 BtShared *pBt = p->pBt;
1567 assert( sqlite3_mutex_held(p->db->mutex) );
1568 sqlite3BtreeEnter(p);
1569 sqlite3PagerSetCachesize(pBt->pPager, mxPage);
1570 sqlite3BtreeLeave(p);
1575 ** Change the way data is synced to disk in order to increase or decrease
1576 ** how well the database resists damage due to OS crashes and power
1577 ** failures. Level 1 is the same as asynchronous (no syncs() occur and
1578 ** there is a high probability of damage) Level 2 is the default. There
1579 ** is a very low but non-zero probability of damage. Level 3 reduces the
1580 ** probability of damage to near zero but with a write performance reduction.
1582 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
1583 int sqlite3BtreeSetSafetyLevel(Btree *p, int level, int fullSync){
1584 BtShared *pBt = p->pBt;
1585 assert( sqlite3_mutex_held(p->db->mutex) );
1586 sqlite3BtreeEnter(p);
1587 sqlite3PagerSetSafetyLevel(pBt->pPager, level, fullSync);
1588 sqlite3BtreeLeave(p);
1594 ** Return TRUE if the given btree is set to safety level 1. In other
1595 ** words, return TRUE if no sync() occurs on the disk files.
1597 int sqlite3BtreeSyncDisabled(Btree *p){
1598 BtShared *pBt = p->pBt;
1600 assert( sqlite3_mutex_held(p->db->mutex) );
1601 sqlite3BtreeEnter(p);
1602 assert( pBt && pBt->pPager );
1603 rc = sqlite3PagerNosync(pBt->pPager);
1604 sqlite3BtreeLeave(p);
1608 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
1610 ** Change the default pages size and the number of reserved bytes per page.
1612 ** The page size must be a power of 2 between 512 and 65536. If the page
1613 ** size supplied does not meet this constraint then the page size is not
1616 ** Page sizes are constrained to be a power of two so that the region
1617 ** of the database file used for locking (beginning at PENDING_BYTE,
1618 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
1619 ** at the beginning of a page.
1621 ** If parameter nReserve is less than zero, then the number of reserved
1622 ** bytes per page is left unchanged.
1624 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve){
1626 BtShared *pBt = p->pBt;
1627 sqlite3BtreeEnter(p);
1628 if( pBt->pageSizeFixed ){
1629 sqlite3BtreeLeave(p);
1630 return SQLITE_READONLY;
1633 nReserve = pBt->pageSize - pBt->usableSize;
1635 if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
1636 ((pageSize-1)&pageSize)==0 ){
1637 assert( (pageSize & 7)==0 );
1638 assert( !pBt->pPage1 && !pBt->pCursor );
1639 pBt->pageSize = pageSize;
1641 rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1643 pBt->usableSize = pBt->pageSize - nReserve;
1644 sqlite3BtreeLeave(p);
1649 ** Return the currently defined page size
1651 int sqlite3BtreeGetPageSize(Btree *p){
1652 return p->pBt->pageSize;
1654 int sqlite3BtreeGetReserve(Btree *p){
1656 sqlite3BtreeEnter(p);
1657 n = p->pBt->pageSize - p->pBt->usableSize;
1658 sqlite3BtreeLeave(p);
1663 ** Set the maximum page count for a database if mxPage is positive.
1664 ** No changes are made if mxPage is 0 or negative.
1665 ** Regardless of the value of mxPage, return the maximum page count.
1667 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
1669 sqlite3BtreeEnter(p);
1670 n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
1671 sqlite3BtreeLeave(p);
1674 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
1677 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
1678 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
1679 ** is disabled. The default value for the auto-vacuum property is
1680 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
1682 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
1683 #ifdef SQLITE_OMIT_AUTOVACUUM
1684 return SQLITE_READONLY;
1686 BtShared *pBt = p->pBt;
1688 int av = (autoVacuum?1:0);
1690 sqlite3BtreeEnter(p);
1691 if( pBt->pageSizeFixed && av!=pBt->autoVacuum ){
1692 rc = SQLITE_READONLY;
1694 pBt->autoVacuum = av;
1696 sqlite3BtreeLeave(p);
1702 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
1703 ** enabled 1 is returned. Otherwise 0.
1705 int sqlite3BtreeGetAutoVacuum(Btree *p){
1706 #ifdef SQLITE_OMIT_AUTOVACUUM
1707 return BTREE_AUTOVACUUM_NONE;
1710 sqlite3BtreeEnter(p);
1712 (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
1713 (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
1714 BTREE_AUTOVACUUM_INCR
1716 sqlite3BtreeLeave(p);
1723 ** Get a reference to pPage1 of the database file. This will
1724 ** also acquire a readlock on that file.
1726 ** SQLITE_OK is returned on success. If the file is not a
1727 ** well-formed database file, then SQLITE_CORRUPT is returned.
1728 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM
1729 ** is returned if we run out of memory.
1731 static int lockBtree(BtShared *pBt){
1736 assert( sqlite3_mutex_held(pBt->mutex) );
1737 if( pBt->pPage1 ) return SQLITE_OK;
1738 rc = sqlite3BtreeGetPage(pBt, 1, &pPage1, 0);
1739 if( rc!=SQLITE_OK ) return rc;
1741 /* Do some checking to help insure the file we opened really is
1742 ** a valid database file.
1744 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1745 if( rc!=SQLITE_OK ){
1746 goto page1_init_failed;
1747 }else if( nPage>0 ){
1750 u8 *page1 = pPage1->aData;
1752 if( memcmp(page1, zMagicHeader, 16)!=0 ){
1753 goto page1_init_failed;
1759 goto page1_init_failed;
1762 /* The maximum embedded fraction must be exactly 25%. And the minimum
1763 ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
1764 ** The original design allowed these amounts to vary, but as of
1765 ** version 3.6.0, we require them to be fixed.
1767 if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
1768 goto page1_init_failed;
1770 pageSize = get2byte(&page1[16]);
1771 if( ((pageSize-1)&pageSize)!=0 || pageSize<512 ||
1772 (SQLITE_MAX_PAGE_SIZE<32768 && pageSize>SQLITE_MAX_PAGE_SIZE)
1774 goto page1_init_failed;
1776 assert( (pageSize & 7)==0 );
1777 usableSize = pageSize - page1[20];
1778 if( pageSize!=pBt->pageSize ){
1779 /* After reading the first page of the database assuming a page size
1780 ** of BtShared.pageSize, we have discovered that the page-size is
1781 ** actually pageSize. Unlock the database, leave pBt->pPage1 at
1782 ** zero and return SQLITE_OK. The caller will call this function
1783 ** again with the correct page-size.
1785 releasePage(pPage1);
1786 pBt->usableSize = usableSize;
1787 pBt->pageSize = pageSize;
1789 sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize);
1792 if( usableSize<500 ){
1793 goto page1_init_failed;
1795 pBt->pageSize = pageSize;
1796 pBt->usableSize = usableSize;
1797 #ifndef SQLITE_OMIT_AUTOVACUUM
1798 pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
1799 pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
1803 /* maxLocal is the maximum amount of payload to store locally for
1804 ** a cell. Make sure it is small enough so that at least minFanout
1805 ** cells can will fit on one page. We assume a 10-byte page header.
1806 ** Besides the payload, the cell must store:
1807 ** 2-byte pointer to the cell
1808 ** 4-byte child pointer
1809 ** 9-byte nKey value
1810 ** 4-byte nData value
1811 ** 4-byte overflow page pointer
1812 ** So a cell consists of a 2-byte poiner, a header which is as much as
1813 ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
1816 pBt->maxLocal = (pBt->usableSize-12)*64/255 - 23;
1817 pBt->minLocal = (pBt->usableSize-12)*32/255 - 23;
1818 pBt->maxLeaf = pBt->usableSize - 35;
1819 pBt->minLeaf = (pBt->usableSize-12)*32/255 - 23;
1820 assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
1821 pBt->pPage1 = pPage1;
1825 releasePage(pPage1);
1831 ** This routine works like lockBtree() except that it also invokes the
1832 ** busy callback if there is lock contention.
1834 static int lockBtreeWithRetry(Btree *pRef){
1837 assert( sqlite3BtreeHoldsMutex(pRef) );
1838 if( pRef->inTrans==TRANS_NONE ){
1839 u8 inTransaction = pRef->pBt->inTransaction;
1840 btreeIntegrity(pRef);
1841 rc = sqlite3BtreeBeginTrans(pRef, 0);
1842 pRef->pBt->inTransaction = inTransaction;
1843 pRef->inTrans = TRANS_NONE;
1844 if( rc==SQLITE_OK ){
1845 pRef->pBt->nTransaction--;
1847 btreeIntegrity(pRef);
1854 ** If there are no outstanding cursors and we are not in the middle
1855 ** of a transaction but there is a read lock on the database, then
1856 ** this routine unrefs the first page of the database file which
1857 ** has the effect of releasing the read lock.
1859 ** If there are any outstanding cursors, this routine is a no-op.
1861 ** If there is a transaction in progress, this routine is a no-op.
1863 static void unlockBtreeIfUnused(BtShared *pBt){
1864 assert( sqlite3_mutex_held(pBt->mutex) );
1865 if( pBt->inTransaction==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
1866 if( sqlite3PagerRefcount(pBt->pPager)>=1 ){
1867 assert( pBt->pPage1->aData );
1869 if( pBt->pPage1->aData==0 ){
1870 MemPage *pPage = pBt->pPage1;
1871 pPage->aData = sqlite3PagerGetData(pPage->pDbPage);
1876 releasePage(pBt->pPage1);
1884 ** Create a new database by initializing the first page of the
1887 static int newDatabase(BtShared *pBt){
1889 unsigned char *data;
1893 assert( sqlite3_mutex_held(pBt->mutex) );
1894 rc = sqlite3PagerPagecount(pBt->pPager, &nPage);
1895 if( rc!=SQLITE_OK || nPage>0 ){
1901 rc = sqlite3PagerWrite(pP1->pDbPage);
1903 memcpy(data, zMagicHeader, sizeof(zMagicHeader));
1904 assert( sizeof(zMagicHeader)==16 );
1905 put2byte(&data[16], pBt->pageSize);
1908 data[20] = pBt->pageSize - pBt->usableSize;
1912 memset(&data[24], 0, 100-24);
1913 zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
1914 pBt->pageSizeFixed = 1;
1915 #ifndef SQLITE_OMIT_AUTOVACUUM
1916 assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
1917 assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
1918 put4byte(&data[36 + 4*4], pBt->autoVacuum);
1919 put4byte(&data[36 + 7*4], pBt->incrVacuum);
1925 ** Attempt to start a new transaction. A write-transaction
1926 ** is started if the second argument is nonzero, otherwise a read-
1927 ** transaction. If the second argument is 2 or more and exclusive
1928 ** transaction is started, meaning that no other process is allowed
1929 ** to access the database. A preexisting transaction may not be
1930 ** upgraded to exclusive by calling this routine a second time - the
1931 ** exclusivity flag only works for a new transaction.
1933 ** A write-transaction must be started before attempting any
1934 ** changes to the database. None of the following routines
1935 ** will work unless a transaction is started first:
1937 ** sqlite3BtreeCreateTable()
1938 ** sqlite3BtreeCreateIndex()
1939 ** sqlite3BtreeClearTable()
1940 ** sqlite3BtreeDropTable()
1941 ** sqlite3BtreeInsert()
1942 ** sqlite3BtreeDelete()
1943 ** sqlite3BtreeUpdateMeta()
1945 ** If an initial attempt to acquire the lock fails because of lock contention
1946 ** and the database was previously unlocked, then invoke the busy handler
1947 ** if there is one. But if there was previously a read-lock, do not
1948 ** invoke the busy handler - just return SQLITE_BUSY. SQLITE_BUSY is
1949 ** returned when there is already a read-lock in order to avoid a deadlock.
1951 ** Suppose there are two processes A and B. A has a read lock and B has
1952 ** a reserved lock. B tries to promote to exclusive but is blocked because
1953 ** of A's read lock. A tries to promote to reserved but is blocked by B.
1954 ** One or the other of the two processes must give way or there can be
1955 ** no progress. By returning SQLITE_BUSY and not invoking the busy callback
1956 ** when A already has a read lock, we encourage A to give up and let B
1959 int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
1960 BtShared *pBt = p->pBt;
1963 sqlite3BtreeEnter(p);
1967 /* If the btree is already in a write-transaction, or it
1968 ** is already in a read-transaction and a read-transaction
1969 ** is requested, this is a no-op.
1971 if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
1975 /* Write transactions are not possible on a read-only database */
1976 if( pBt->readOnly && wrflag ){
1977 rc = SQLITE_READONLY;
1981 /* If another database handle has already opened a write transaction
1982 ** on this shared-btree structure and a second write transaction is
1983 ** requested, return SQLITE_BUSY.
1985 if( pBt->inTransaction==TRANS_WRITE && wrflag ){
1990 #ifndef SQLITE_OMIT_SHARED_CACHE
1993 for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
1994 if( pIter->pBtree!=p ){
2003 if( pBt->pPage1==0 ){
2005 rc = lockBtree(pBt);
2006 }while( pBt->pPage1==0 && rc==SQLITE_OK );
2009 if( rc==SQLITE_OK && wrflag ){
2010 if( pBt->readOnly ){
2011 rc = SQLITE_READONLY;
2013 rc = sqlite3PagerBegin(pBt->pPage1->pDbPage, wrflag>1);
2014 if( rc==SQLITE_OK ){
2015 rc = newDatabase(pBt);
2020 if( rc==SQLITE_OK ){
2021 if( wrflag ) pBt->inStmt = 0;
2023 unlockBtreeIfUnused(pBt);
2025 }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
2026 sqlite3BtreeInvokeBusyHandler(pBt, 0) );
2028 if( rc==SQLITE_OK ){
2029 if( p->inTrans==TRANS_NONE ){
2030 pBt->nTransaction++;
2032 p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
2033 if( p->inTrans>pBt->inTransaction ){
2034 pBt->inTransaction = p->inTrans;
2036 #ifndef SQLITE_OMIT_SHARED_CACHE
2038 assert( !pBt->pExclusive );
2039 pBt->pExclusive = p;
2047 sqlite3BtreeLeave(p);
2052 #ifndef SQLITE_OMIT_AUTOVACUUM
2055 ** Set the pointer-map entries for all children of page pPage. Also, if
2056 ** pPage contains cells that point to overflow pages, set the pointer
2057 ** map entries for the overflow pages as well.
2059 static int setChildPtrmaps(MemPage *pPage){
2060 int i; /* Counter variable */
2061 int nCell; /* Number of cells in page pPage */
2062 int rc; /* Return code */
2063 BtShared *pBt = pPage->pBt;
2064 int isInitOrig = pPage->isInit;
2065 Pgno pgno = pPage->pgno;
2067 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2068 rc = sqlite3BtreeInitPage(pPage);
2069 if( rc!=SQLITE_OK ){
2070 goto set_child_ptrmaps_out;
2072 nCell = pPage->nCell;
2074 for(i=0; i<nCell; i++){
2075 u8 *pCell = findCell(pPage, i);
2077 rc = ptrmapPutOvflPtr(pPage, pCell);
2078 if( rc!=SQLITE_OK ){
2079 goto set_child_ptrmaps_out;
2083 Pgno childPgno = get4byte(pCell);
2084 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2085 if( rc!=SQLITE_OK ) goto set_child_ptrmaps_out;
2090 Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
2091 rc = ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno);
2094 set_child_ptrmaps_out:
2095 pPage->isInit = isInitOrig;
2100 ** Somewhere on pPage, which is guarenteed to be a btree page, not an overflow
2101 ** page, is a pointer to page iFrom. Modify this pointer so that it points to
2102 ** iTo. Parameter eType describes the type of pointer to be modified, as
2105 ** PTRMAP_BTREE: pPage is a btree-page. The pointer points at a child
2108 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
2109 ** page pointed to by one of the cells on pPage.
2111 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
2112 ** overflow page in the list.
2114 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
2115 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2116 if( eType==PTRMAP_OVERFLOW2 ){
2117 /* The pointer is always the first 4 bytes of the page in this case. */
2118 if( get4byte(pPage->aData)!=iFrom ){
2119 return SQLITE_CORRUPT_BKPT;
2121 put4byte(pPage->aData, iTo);
2123 int isInitOrig = pPage->isInit;
2127 sqlite3BtreeInitPage(pPage);
2128 nCell = pPage->nCell;
2130 for(i=0; i<nCell; i++){
2131 u8 *pCell = findCell(pPage, i);
2132 if( eType==PTRMAP_OVERFLOW1 ){
2134 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
2135 if( info.iOverflow ){
2136 if( iFrom==get4byte(&pCell[info.iOverflow]) ){
2137 put4byte(&pCell[info.iOverflow], iTo);
2142 if( get4byte(pCell)==iFrom ){
2143 put4byte(pCell, iTo);
2150 if( eType!=PTRMAP_BTREE ||
2151 get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
2152 return SQLITE_CORRUPT_BKPT;
2154 put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
2157 pPage->isInit = isInitOrig;
2164 ** Move the open database page pDbPage to location iFreePage in the
2165 ** database. The pDbPage reference remains valid.
2167 static int relocatePage(
2168 BtShared *pBt, /* Btree */
2169 MemPage *pDbPage, /* Open page to move */
2170 u8 eType, /* Pointer map 'type' entry for pDbPage */
2171 Pgno iPtrPage, /* Pointer map 'page-no' entry for pDbPage */
2172 Pgno iFreePage, /* The location to move pDbPage to */
2175 MemPage *pPtrPage; /* The page that contains a pointer to pDbPage */
2176 Pgno iDbPage = pDbPage->pgno;
2177 Pager *pPager = pBt->pPager;
2180 assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
2181 eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
2182 assert( sqlite3_mutex_held(pBt->mutex) );
2183 assert( pDbPage->pBt==pBt );
2185 /* Move page iDbPage from its current location to page number iFreePage */
2186 TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
2187 iDbPage, iFreePage, iPtrPage, eType));
2188 rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
2189 if( rc!=SQLITE_OK ){
2192 pDbPage->pgno = iFreePage;
2194 /* If pDbPage was a btree-page, then it may have child pages and/or cells
2195 ** that point to overflow pages. The pointer map entries for all these
2196 ** pages need to be changed.
2198 ** If pDbPage is an overflow page, then the first 4 bytes may store a
2199 ** pointer to a subsequent overflow page. If this is the case, then
2200 ** the pointer map needs to be updated for the subsequent overflow page.
2202 if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
2203 rc = setChildPtrmaps(pDbPage);
2204 if( rc!=SQLITE_OK ){
2208 Pgno nextOvfl = get4byte(pDbPage->aData);
2210 rc = ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage);
2211 if( rc!=SQLITE_OK ){
2217 /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
2218 ** that it points at iFreePage. Also fix the pointer map entry for
2221 if( eType!=PTRMAP_ROOTPAGE ){
2222 rc = sqlite3BtreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
2223 if( rc!=SQLITE_OK ){
2226 rc = sqlite3PagerWrite(pPtrPage->pDbPage);
2227 if( rc!=SQLITE_OK ){
2228 releasePage(pPtrPage);
2231 rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
2232 releasePage(pPtrPage);
2233 if( rc==SQLITE_OK ){
2234 rc = ptrmapPut(pBt, iFreePage, eType, iPtrPage);
2240 /* Forward declaration required by incrVacuumStep(). */
2241 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
2244 ** Perform a single step of an incremental-vacuum. If successful,
2245 ** return SQLITE_OK. If there is no work to do (and therefore no
2246 ** point in calling this function again), return SQLITE_DONE.
2248 ** More specificly, this function attempts to re-organize the
2249 ** database so that the last page of the file currently in use
2250 ** is no longer in use.
2252 ** If the nFin parameter is non-zero, the implementation assumes
2253 ** that the caller will keep calling incrVacuumStep() until
2254 ** it returns SQLITE_DONE or an error, and that nFin is the
2255 ** number of pages the database file will contain after this
2256 ** process is complete.
2258 static int incrVacuumStep(BtShared *pBt, Pgno nFin){
2259 Pgno iLastPg; /* Last page in the database */
2260 Pgno nFreeList; /* Number of pages still on the free-list */
2262 assert( sqlite3_mutex_held(pBt->mutex) );
2263 iLastPg = pBt->nTrunc;
2265 iLastPg = pagerPagecount(pBt->pPager);
2268 if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
2273 nFreeList = get4byte(&pBt->pPage1->aData[36]);
2274 if( nFreeList==0 || nFin==iLastPg ){
2278 rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
2279 if( rc!=SQLITE_OK ){
2282 if( eType==PTRMAP_ROOTPAGE ){
2283 return SQLITE_CORRUPT_BKPT;
2286 if( eType==PTRMAP_FREEPAGE ){
2288 /* Remove the page from the files free-list. This is not required
2289 ** if nFin is non-zero. In that case, the free-list will be
2290 ** truncated to zero after this function returns, so it doesn't
2291 ** matter if it still contains some garbage entries.
2295 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, 1);
2296 if( rc!=SQLITE_OK ){
2299 assert( iFreePg==iLastPg );
2300 releasePage(pFreePg);
2303 Pgno iFreePg; /* Index of free page to move pLastPg to */
2306 rc = sqlite3BtreeGetPage(pBt, iLastPg, &pLastPg, 0);
2307 if( rc!=SQLITE_OK ){
2311 /* If nFin is zero, this loop runs exactly once and page pLastPg
2312 ** is swapped with the first free page pulled off the free list.
2314 ** On the other hand, if nFin is greater than zero, then keep
2315 ** looping until a free-page located within the first nFin pages
2316 ** of the file is found.
2320 rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, 0, 0);
2321 if( rc!=SQLITE_OK ){
2322 releasePage(pLastPg);
2325 releasePage(pFreePg);
2326 }while( nFin!=0 && iFreePg>nFin );
2327 assert( iFreePg<iLastPg );
2329 rc = sqlite3PagerWrite(pLastPg->pDbPage);
2330 if( rc==SQLITE_OK ){
2331 rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, nFin!=0);
2333 releasePage(pLastPg);
2334 if( rc!=SQLITE_OK ){
2340 pBt->nTrunc = iLastPg - 1;
2341 while( pBt->nTrunc==PENDING_BYTE_PAGE(pBt)||PTRMAP_ISPAGE(pBt, pBt->nTrunc) ){
2348 ** A write-transaction must be opened before calling this function.
2349 ** It performs a single unit of work towards an incremental vacuum.
2351 ** If the incremental vacuum is finished after this function has run,
2352 ** SQLITE_DONE is returned. If it is not finished, but no error occured,
2353 ** SQLITE_OK is returned. Otherwise an SQLite error code.
2355 int sqlite3BtreeIncrVacuum(Btree *p){
2357 BtShared *pBt = p->pBt;
2359 sqlite3BtreeEnter(p);
2361 assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
2362 if( !pBt->autoVacuum ){
2365 invalidateAllOverflowCache(pBt);
2366 rc = incrVacuumStep(pBt, 0);
2368 sqlite3BtreeLeave(p);
2373 ** This routine is called prior to sqlite3PagerCommit when a transaction
2374 ** is commited for an auto-vacuum database.
2376 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
2377 ** the database file should be truncated to during the commit process.
2378 ** i.e. the database has been reorganized so that only the first *pnTrunc
2379 ** pages are in use.
2381 static int autoVacuumCommit(BtShared *pBt, Pgno *pnTrunc){
2383 Pager *pPager = pBt->pPager;
2384 VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
2386 assert( sqlite3_mutex_held(pBt->mutex) );
2387 invalidateAllOverflowCache(pBt);
2388 assert(pBt->autoVacuum);
2389 if( !pBt->incrVacuum ){
2392 if( pBt->nTrunc==0 ){
2395 const int pgsz = pBt->pageSize;
2396 int nOrig = pagerPagecount(pBt->pPager);
2398 if( PTRMAP_ISPAGE(pBt, nOrig) ){
2399 return SQLITE_CORRUPT_BKPT;
2401 if( nOrig==PENDING_BYTE_PAGE(pBt) ){
2404 nFree = get4byte(&pBt->pPage1->aData[36]);
2405 nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+pgsz/5)/(pgsz/5);
2406 nFin = nOrig - nFree - nPtrmap;
2407 if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<=PENDING_BYTE_PAGE(pBt) ){
2410 while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
2415 while( rc==SQLITE_OK ){
2416 rc = incrVacuumStep(pBt, nFin);
2418 if( rc==SQLITE_DONE ){
2419 assert(nFin==0 || pBt->nTrunc==0 || nFin<=pBt->nTrunc);
2421 if( pBt->nTrunc && nFin ){
2422 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
2423 put4byte(&pBt->pPage1->aData[32], 0);
2424 put4byte(&pBt->pPage1->aData[36], 0);
2428 if( rc!=SQLITE_OK ){
2429 sqlite3PagerRollback(pPager);
2433 if( rc==SQLITE_OK ){
2434 *pnTrunc = pBt->nTrunc;
2437 assert( nRef==sqlite3PagerRefcount(pPager) );
2444 ** This routine does the first phase of a two-phase commit. This routine
2445 ** causes a rollback journal to be created (if it does not already exist)
2446 ** and populated with enough information so that if a power loss occurs
2447 ** the database can be restored to its original state by playing back
2448 ** the journal. Then the contents of the journal are flushed out to
2449 ** the disk. After the journal is safely on oxide, the changes to the
2450 ** database are written into the database file and flushed to oxide.
2451 ** At the end of this call, the rollback journal still exists on the
2452 ** disk and we are still holding all locks, so the transaction has not
2453 ** committed. See sqlite3BtreeCommit() for the second phase of the
2456 ** This call is a no-op if no write-transaction is currently active on pBt.
2458 ** Otherwise, sync the database file for the btree pBt. zMaster points to
2459 ** the name of a master journal file that should be written into the
2460 ** individual journal file, or is NULL, indicating no master journal file
2461 ** (single database transaction).
2463 ** When this is called, the master journal should already have been
2464 ** created, populated with this journal pointer and synced to disk.
2466 ** Once this is routine has returned, the only thing required to commit
2467 ** the write-transaction for this database file is to delete the journal.
2469 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
2471 if( p->inTrans==TRANS_WRITE ){
2472 BtShared *pBt = p->pBt;
2474 sqlite3BtreeEnter(p);
2476 #ifndef SQLITE_OMIT_AUTOVACUUM
2477 if( pBt->autoVacuum ){
2478 rc = autoVacuumCommit(pBt, &nTrunc);
2479 if( rc!=SQLITE_OK ){
2480 sqlite3BtreeLeave(p);
2485 rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, nTrunc, 0);
2486 sqlite3BtreeLeave(p);
2492 ** Commit the transaction currently in progress.
2494 ** This routine implements the second phase of a 2-phase commit. The
2495 ** sqlite3BtreeSync() routine does the first phase and should be invoked
2496 ** prior to calling this routine. The sqlite3BtreeSync() routine did
2497 ** all the work of writing information out to disk and flushing the
2498 ** contents so that they are written onto the disk platter. All this
2499 ** routine has to do is delete or truncate the rollback journal
2500 ** (which causes the transaction to commit) and drop locks.
2502 ** This will release the write lock on the database file. If there
2503 ** are no active cursors, it also releases the read lock.
2505 int sqlite3BtreeCommitPhaseTwo(Btree *p){
2506 BtShared *pBt = p->pBt;
2508 sqlite3BtreeEnter(p);
2512 /* If the handle has a write-transaction open, commit the shared-btrees
2513 ** transaction and set the shared state to TRANS_READ.
2515 if( p->inTrans==TRANS_WRITE ){
2517 assert( pBt->inTransaction==TRANS_WRITE );
2518 assert( pBt->nTransaction>0 );
2519 rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
2520 if( rc!=SQLITE_OK ){
2521 sqlite3BtreeLeave(p);
2524 pBt->inTransaction = TRANS_READ;
2529 /* If the handle has any kind of transaction open, decrement the transaction
2530 ** count of the shared btree. If the transaction count reaches 0, set
2531 ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
2532 ** will unlock the pager.
2534 if( p->inTrans!=TRANS_NONE ){
2535 pBt->nTransaction--;
2536 if( 0==pBt->nTransaction ){
2537 pBt->inTransaction = TRANS_NONE;
2541 /* Set the handles current transaction state to TRANS_NONE and unlock
2542 ** the pager if this call closed the only read or write transaction.
2544 p->inTrans = TRANS_NONE;
2545 unlockBtreeIfUnused(pBt);
2548 sqlite3BtreeLeave(p);
2553 ** Do both phases of a commit.
2555 int sqlite3BtreeCommit(Btree *p){
2557 sqlite3BtreeEnter(p);
2558 rc = sqlite3BtreeCommitPhaseOne(p, 0);
2559 if( rc==SQLITE_OK ){
2560 rc = sqlite3BtreeCommitPhaseTwo(p);
2562 sqlite3BtreeLeave(p);
2568 ** Return the number of write-cursors open on this handle. This is for use
2569 ** in assert() expressions, so it is only compiled if NDEBUG is not
2572 ** For the purposes of this routine, a write-cursor is any cursor that
2573 ** is capable of writing to the databse. That means the cursor was
2574 ** originally opened for writing and the cursor has not be disabled
2575 ** by having its state changed to CURSOR_FAULT.
2577 static int countWriteCursors(BtShared *pBt){
2580 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2581 if( pCur->wrFlag && pCur->eState!=CURSOR_FAULT ) r++;
2588 ** This routine sets the state to CURSOR_FAULT and the error
2589 ** code to errCode for every cursor on BtShared that pBtree
2592 ** Every cursor is tripped, including cursors that belong
2593 ** to other database connections that happen to be sharing
2594 ** the cache with pBtree.
2596 ** This routine gets called when a rollback occurs.
2597 ** All cursors using the same cache must be tripped
2598 ** to prevent them from trying to use the btree after
2599 ** the rollback. The rollback may have deleted tables
2600 ** or moved root pages, so it is not sufficient to
2601 ** save the state of the cursor. The cursor must be
2604 void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
2606 sqlite3BtreeEnter(pBtree);
2607 for(p=pBtree->pBt->pCursor; p; p=p->pNext){
2608 sqlite3BtreeClearCursor(p);
2609 p->eState = CURSOR_FAULT;
2612 sqlite3BtreeLeave(pBtree);
2616 ** Rollback the transaction in progress. All cursors will be
2617 ** invalided by this operation. Any attempt to use a cursor
2618 ** that was open at the beginning of this operation will result
2621 ** This will release the write lock on the database file. If there
2622 ** are no active cursors, it also releases the read lock.
2624 int sqlite3BtreeRollback(Btree *p){
2626 BtShared *pBt = p->pBt;
2629 sqlite3BtreeEnter(p);
2631 rc = saveAllCursors(pBt, 0, 0);
2632 #ifndef SQLITE_OMIT_SHARED_CACHE
2633 if( rc!=SQLITE_OK ){
2634 /* This is a horrible situation. An IO or malloc() error occured whilst
2635 ** trying to save cursor positions. If this is an automatic rollback (as
2636 ** the result of a constraint, malloc() failure or IO error) then
2637 ** the cache may be internally inconsistent (not contain valid trees) so
2638 ** we cannot simply return the error to the caller. Instead, abort
2639 ** all queries that may be using any of the cursors that failed to save.
2641 sqlite3BtreeTripAllCursors(p, rc);
2647 if( p->inTrans==TRANS_WRITE ){
2650 #ifndef SQLITE_OMIT_AUTOVACUUM
2654 assert( TRANS_WRITE==pBt->inTransaction );
2655 rc2 = sqlite3PagerRollback(pBt->pPager);
2656 if( rc2!=SQLITE_OK ){
2660 /* The rollback may have destroyed the pPage1->aData value. So
2661 ** call sqlite3BtreeGetPage() on page 1 again to make
2662 ** sure pPage1->aData is set correctly. */
2663 if( sqlite3BtreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
2664 releasePage(pPage1);
2666 assert( countWriteCursors(pBt)==0 );
2667 pBt->inTransaction = TRANS_READ;
2670 if( p->inTrans!=TRANS_NONE ){
2671 assert( pBt->nTransaction>0 );
2672 pBt->nTransaction--;
2673 if( 0==pBt->nTransaction ){
2674 pBt->inTransaction = TRANS_NONE;
2678 p->inTrans = TRANS_NONE;
2680 unlockBtreeIfUnused(pBt);
2683 sqlite3BtreeLeave(p);
2688 ** Start a statement subtransaction. The subtransaction can
2689 ** can be rolled back independently of the main transaction.
2690 ** You must start a transaction before starting a subtransaction.
2691 ** The subtransaction is ended automatically if the main transaction
2692 ** commits or rolls back.
2694 ** Only one subtransaction may be active at a time. It is an error to try
2695 ** to start a new subtransaction if another subtransaction is already active.
2697 ** Statement subtransactions are used around individual SQL statements
2698 ** that are contained within a BEGIN...COMMIT block. If a constraint
2699 ** error occurs within the statement, the effect of that one statement
2700 ** can be rolled back without having to rollback the entire transaction.
2702 int sqlite3BtreeBeginStmt(Btree *p){
2704 BtShared *pBt = p->pBt;
2705 sqlite3BtreeEnter(p);
2707 if( (p->inTrans!=TRANS_WRITE) || pBt->inStmt ){
2708 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2710 assert( pBt->inTransaction==TRANS_WRITE );
2711 rc = pBt->readOnly ? SQLITE_OK : sqlite3PagerStmtBegin(pBt->pPager);
2714 sqlite3BtreeLeave(p);
2720 ** Commit the statment subtransaction currently in progress. If no
2721 ** subtransaction is active, this is a no-op.
2723 int sqlite3BtreeCommitStmt(Btree *p){
2725 BtShared *pBt = p->pBt;
2726 sqlite3BtreeEnter(p);
2728 if( pBt->inStmt && !pBt->readOnly ){
2729 rc = sqlite3PagerStmtCommit(pBt->pPager);
2734 sqlite3BtreeLeave(p);
2739 ** Rollback the active statement subtransaction. If no subtransaction
2740 ** is active this routine is a no-op.
2742 ** All cursors will be invalidated by this operation. Any attempt
2743 ** to use a cursor that was open at the beginning of this operation
2744 ** will result in an error.
2746 int sqlite3BtreeRollbackStmt(Btree *p){
2748 BtShared *pBt = p->pBt;
2749 sqlite3BtreeEnter(p);
2751 if( pBt->inStmt && !pBt->readOnly ){
2752 rc = sqlite3PagerStmtRollback(pBt->pPager);
2755 sqlite3BtreeLeave(p);
2760 ** Create a new cursor for the BTree whose root is on the page
2761 ** iTable. The act of acquiring a cursor gets a read lock on
2762 ** the database file.
2764 ** If wrFlag==0, then the cursor can only be used for reading.
2765 ** If wrFlag==1, then the cursor can be used for reading or for
2766 ** writing if other conditions for writing are also met. These
2767 ** are the conditions that must be met in order for writing to
2770 ** 1: The cursor must have been opened with wrFlag==1
2772 ** 2: Other database connections that share the same pager cache
2773 ** but which are not in the READ_UNCOMMITTED state may not have
2774 ** cursors open with wrFlag==0 on the same table. Otherwise
2775 ** the changes made by this write cursor would be visible to
2776 ** the read cursors in the other database connection.
2778 ** 3: The database must be writable (not on read-only media)
2780 ** 4: There must be an active transaction.
2782 ** No checking is done to make sure that page iTable really is the
2783 ** root page of a b-tree. If it is not, then the cursor acquired
2784 ** will not work correctly.
2786 ** It is assumed that the sqlite3BtreeCursorSize() bytes of memory
2787 ** pointed to by pCur have been zeroed by the caller.
2789 static int btreeCursor(
2790 Btree *p, /* The btree */
2791 int iTable, /* Root page of table to open */
2792 int wrFlag, /* 1 to write. 0 read-only */
2793 struct KeyInfo *pKeyInfo, /* First arg to comparison function */
2794 BtCursor *pCur /* Space for new cursor */
2797 BtShared *pBt = p->pBt;
2799 assert( sqlite3BtreeHoldsMutex(p) );
2801 if( pBt->readOnly ){
2802 return SQLITE_READONLY;
2804 if( checkReadLocks(p, iTable, 0, 0) ){
2805 return SQLITE_LOCKED;
2809 if( pBt->pPage1==0 ){
2810 rc = lockBtreeWithRetry(p);
2811 if( rc!=SQLITE_OK ){
2814 if( pBt->readOnly && wrFlag ){
2815 return SQLITE_READONLY;
2818 pCur->pgnoRoot = (Pgno)iTable;
2819 if( iTable==1 && pagerPagecount(pBt->pPager)==0 ){
2821 goto create_cursor_exception;
2823 rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]);
2824 if( rc!=SQLITE_OK ){
2825 goto create_cursor_exception;
2828 /* Now that no other errors can occur, finish filling in the BtCursor
2829 ** variables, link the cursor into the BtShared list and set *ppCur (the
2830 ** output argument to this function).
2832 pCur->pKeyInfo = pKeyInfo;
2835 pCur->wrFlag = wrFlag;
2836 pCur->pNext = pBt->pCursor;
2838 pCur->pNext->pPrev = pCur;
2840 pBt->pCursor = pCur;
2841 pCur->eState = CURSOR_INVALID;
2845 create_cursor_exception:
2846 releasePage(pCur->apPage[0]);
2847 unlockBtreeIfUnused(pBt);
2850 int sqlite3BtreeCursor(
2851 Btree *p, /* The btree */
2852 int iTable, /* Root page of table to open */
2853 int wrFlag, /* 1 to write. 0 read-only */
2854 struct KeyInfo *pKeyInfo, /* First arg to xCompare() */
2855 BtCursor *pCur /* Write new cursor here */
2858 sqlite3BtreeEnter(p);
2860 rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
2861 sqlite3BtreeLeave(p);
2864 int sqlite3BtreeCursorSize(){
2865 return sizeof(BtCursor);
2871 ** Close a cursor. The read lock on the database file is released
2872 ** when the last cursor is closed.
2874 int sqlite3BtreeCloseCursor(BtCursor *pCur){
2875 Btree *pBtree = pCur->pBtree;
2878 BtShared *pBt = pCur->pBt;
2879 sqlite3BtreeEnter(pBtree);
2880 pBt->db = pBtree->db;
2881 sqlite3BtreeClearCursor(pCur);
2883 pCur->pPrev->pNext = pCur->pNext;
2885 pBt->pCursor = pCur->pNext;
2888 pCur->pNext->pPrev = pCur->pPrev;
2890 for(i=0; i<=pCur->iPage; i++){
2891 releasePage(pCur->apPage[i]);
2893 unlockBtreeIfUnused(pBt);
2894 invalidateOverflowCache(pCur);
2895 /* sqlite3_free(pCur); */
2896 sqlite3BtreeLeave(pBtree);
2902 ** Make a temporary cursor by filling in the fields of pTempCur.
2903 ** The temporary cursor is not on the cursor list for the Btree.
2905 void sqlite3BtreeGetTempCursor(BtCursor *pCur, BtCursor *pTempCur){
2907 assert( cursorHoldsMutex(pCur) );
2908 memcpy(pTempCur, pCur, sizeof(BtCursor));
2909 pTempCur->pNext = 0;
2910 pTempCur->pPrev = 0;
2911 for(i=0; i<=pTempCur->iPage; i++){
2912 sqlite3PagerRef(pTempCur->apPage[i]->pDbPage);
2917 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
2920 void sqlite3BtreeReleaseTempCursor(BtCursor *pCur){
2922 assert( cursorHoldsMutex(pCur) );
2923 for(i=0; i<=pCur->iPage; i++){
2924 sqlite3PagerUnref(pCur->apPage[i]->pDbPage);
2929 ** Make sure the BtCursor* given in the argument has a valid
2930 ** BtCursor.info structure. If it is not already valid, call
2931 ** sqlite3BtreeParseCell() to fill it in.
2933 ** BtCursor.info is a cache of the information in the current cell.
2934 ** Using this cache reduces the number of calls to sqlite3BtreeParseCell().
2936 ** 2007-06-25: There is a bug in some versions of MSVC that cause the
2937 ** compiler to crash when getCellInfo() is implemented as a macro.
2938 ** But there is a measureable speed advantage to using the macro on gcc
2939 ** (when less compiler optimizations like -Os or -O0 are used and the
2940 ** compiler is not doing agressive inlining.) So we use a real function
2941 ** for MSVC and a macro for everything else. Ticket #2457.
2944 static void assertCellInfo(BtCursor *pCur){
2946 int iPage = pCur->iPage;
2947 memset(&info, 0, sizeof(info));
2948 sqlite3BtreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
2949 assert( memcmp(&info, &pCur->info, sizeof(info))==0 );
2952 #define assertCellInfo(x)
2955 /* Use a real function in MSVC to work around bugs in that compiler. */
2956 static void getCellInfo(BtCursor *pCur){
2957 if( pCur->info.nSize==0 ){
2958 int iPage = pCur->iPage;
2959 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
2960 pCur->validNKey = 1;
2962 assertCellInfo(pCur);
2965 #else /* if not _MSC_VER */
2966 /* Use a macro in all other compilers so that the function is inlined */
2967 #define getCellInfo(pCur) \
2968 if( pCur->info.nSize==0 ){ \
2969 int iPage = pCur->iPage; \
2970 sqlite3BtreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info); \
2971 pCur->validNKey = 1; \
2973 assertCellInfo(pCur); \
2975 #endif /* _MSC_VER */
2978 ** Set *pSize to the size of the buffer needed to hold the value of
2979 ** the key for the current entry. If the cursor is not pointing
2980 ** to a valid entry, *pSize is set to 0.
2982 ** For a table with the INTKEY flag set, this routine returns the key
2983 ** itself, not the number of bytes in the key.
2985 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
2988 assert( cursorHoldsMutex(pCur) );
2989 rc = restoreCursorPosition(pCur);
2990 if( rc==SQLITE_OK ){
2991 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
2992 if( pCur->eState==CURSOR_INVALID ){
2996 *pSize = pCur->info.nKey;
3003 ** Set *pSize to the number of bytes of data in the entry the
3004 ** cursor currently points to. Always return SQLITE_OK.
3005 ** Failure is not possible. If the cursor is not currently
3006 ** pointing to an entry (which can happen, for example, if
3007 ** the database is empty) then *pSize is set to 0.
3009 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
3012 assert( cursorHoldsMutex(pCur) );
3013 rc = restoreCursorPosition(pCur);
3014 if( rc==SQLITE_OK ){
3015 assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
3016 if( pCur->eState==CURSOR_INVALID ){
3017 /* Not pointing at a valid entry - set *pSize to 0. */
3021 *pSize = pCur->info.nData;
3028 ** Given the page number of an overflow page in the database (parameter
3029 ** ovfl), this function finds the page number of the next page in the
3030 ** linked list of overflow pages. If possible, it uses the auto-vacuum
3031 ** pointer-map data instead of reading the content of page ovfl to do so.
3033 ** If an error occurs an SQLite error code is returned. Otherwise:
3035 ** Unless pPgnoNext is NULL, the page number of the next overflow
3036 ** page in the linked list is written to *pPgnoNext. If page ovfl
3037 ** is the last page in its linked list, *pPgnoNext is set to zero.
3039 ** If ppPage is not NULL, *ppPage is set to the MemPage* handle
3040 ** for page ovfl. The underlying pager page may have been requested
3041 ** with the noContent flag set, so the page data accessable via
3042 ** this handle may not be trusted.
3044 static int getOverflowPage(
3046 Pgno ovfl, /* Overflow page */
3047 MemPage **ppPage, /* OUT: MemPage handle */
3048 Pgno *pPgnoNext /* OUT: Next overflow page number */
3053 assert( sqlite3_mutex_held(pBt->mutex) );
3054 /* One of these must not be NULL. Otherwise, why call this function? */
3055 assert(ppPage || pPgnoNext);
3057 /* If pPgnoNext is NULL, then this function is being called to obtain
3058 ** a MemPage* reference only. No page-data is required in this case.
3061 return sqlite3BtreeGetPage(pBt, ovfl, ppPage, 1);
3064 #ifndef SQLITE_OMIT_AUTOVACUUM
3065 /* Try to find the next page in the overflow list using the
3066 ** autovacuum pointer-map pages. Guess that the next page in
3067 ** the overflow list is page number (ovfl+1). If that guess turns
3068 ** out to be wrong, fall back to loading the data of page
3069 ** number ovfl to determine the next page number.
3071 if( pBt->autoVacuum ){
3073 Pgno iGuess = ovfl+1;
3076 while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
3080 if( iGuess<=pagerPagecount(pBt->pPager) ){
3081 rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
3082 if( rc!=SQLITE_OK ){
3085 if( eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
3092 if( next==0 || ppPage ){
3095 rc = sqlite3BtreeGetPage(pBt, ovfl, &pPage, next!=0);
3096 assert(rc==SQLITE_OK || pPage==0);
3097 if( next==0 && rc==SQLITE_OK ){
3098 next = get4byte(pPage->aData);
3113 ** Copy data from a buffer to a page, or from a page to a buffer.
3115 ** pPayload is a pointer to data stored on database page pDbPage.
3116 ** If argument eOp is false, then nByte bytes of data are copied
3117 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
3118 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
3119 ** of data are copied from the buffer pBuf to pPayload.
3121 ** SQLITE_OK is returned on success, otherwise an error code.
3123 static int copyPayload(
3124 void *pPayload, /* Pointer to page data */
3125 void *pBuf, /* Pointer to buffer */
3126 int nByte, /* Number of bytes to copy */
3127 int eOp, /* 0 -> copy from page, 1 -> copy to page */
3128 DbPage *pDbPage /* Page containing pPayload */
3131 /* Copy data from buffer to page (a write operation) */
3132 int rc = sqlite3PagerWrite(pDbPage);
3133 if( rc!=SQLITE_OK ){
3136 memcpy(pPayload, pBuf, nByte);
3138 /* Copy data from page to buffer (a read operation) */
3139 memcpy(pBuf, pPayload, nByte);
3145 ** This function is used to read or overwrite payload information
3146 ** for the entry that the pCur cursor is pointing to. If the eOp
3147 ** parameter is 0, this is a read operation (data copied into
3148 ** buffer pBuf). If it is non-zero, a write (data copied from
3151 ** A total of "amt" bytes are read or written beginning at "offset".
3152 ** Data is read to or from the buffer pBuf.
3154 ** This routine does not make a distinction between key and data.
3155 ** It just reads or writes bytes from the payload area. Data might
3156 ** appear on the main page or be scattered out on multiple overflow
3159 ** If the BtCursor.isIncrblobHandle flag is set, and the current
3160 ** cursor entry uses one or more overflow pages, this function
3161 ** allocates space for and lazily popluates the overflow page-list
3162 ** cache array (BtCursor.aOverflow). Subsequent calls use this
3163 ** cache to make seeking to the supplied offset more efficient.
3165 ** Once an overflow page-list cache has been allocated, it may be
3166 ** invalidated if some other cursor writes to the same table, or if
3167 ** the cursor is moved to a different row. Additionally, in auto-vacuum
3168 ** mode, the following events may invalidate an overflow page-list cache.
3170 ** * An incremental vacuum,
3171 ** * A commit in auto_vacuum="full" mode,
3172 ** * Creating a table (may require moving an overflow page).
3174 static int accessPayload(
3175 BtCursor *pCur, /* Cursor pointing to entry to read from */
3176 int offset, /* Begin reading this far into payload */
3177 int amt, /* Read this many bytes */
3178 unsigned char *pBuf, /* Write the bytes into this buffer */
3179 int skipKey, /* offset begins at data if this is true */
3180 int eOp /* zero to read. non-zero to write. */
3182 unsigned char *aPayload;
3186 MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
3187 BtShared *pBt = pCur->pBt; /* Btree this cursor belongs to */
3190 assert( pCur->eState==CURSOR_VALID );
3191 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3192 assert( offset>=0 );
3193 assert( cursorHoldsMutex(pCur) );
3196 aPayload = pCur->info.pCell + pCur->info.nHeader;
3197 nKey = (pPage->intKey ? 0 : pCur->info.nKey);
3202 if( offset+amt > nKey+pCur->info.nData
3203 || &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
3205 /* Trying to read or write past the end of the data is an error */
3206 return SQLITE_CORRUPT_BKPT;
3209 /* Check if data must be read/written to/from the btree page itself. */
3210 if( offset<pCur->info.nLocal ){
3212 if( a+offset>pCur->info.nLocal ){
3213 a = pCur->info.nLocal - offset;
3215 rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
3220 offset -= pCur->info.nLocal;
3224 if( rc==SQLITE_OK && amt>0 ){
3225 const int ovflSize = pBt->usableSize - 4; /* Bytes content per ovfl page */
3228 nextPage = get4byte(&aPayload[pCur->info.nLocal]);
3230 #ifndef SQLITE_OMIT_INCRBLOB
3231 /* If the isIncrblobHandle flag is set and the BtCursor.aOverflow[]
3232 ** has not been allocated, allocate it now. The array is sized at
3233 ** one entry for each overflow page in the overflow chain. The
3234 ** page number of the first overflow page is stored in aOverflow[0],
3235 ** etc. A value of 0 in the aOverflow[] array means "not yet known"
3236 ** (the cache is lazily populated).
3238 if( pCur->isIncrblobHandle && !pCur->aOverflow ){
3239 int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
3240 pCur->aOverflow = (Pgno *)sqlite3MallocZero(sizeof(Pgno)*nOvfl);
3241 if( nOvfl && !pCur->aOverflow ){
3246 /* If the overflow page-list cache has been allocated and the
3247 ** entry for the first required overflow page is valid, skip
3250 if( pCur->aOverflow && pCur->aOverflow[offset/ovflSize] ){
3251 iIdx = (offset/ovflSize);
3252 nextPage = pCur->aOverflow[iIdx];
3253 offset = (offset%ovflSize);
3257 for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
3259 #ifndef SQLITE_OMIT_INCRBLOB
3260 /* If required, populate the overflow page-list cache. */
3261 if( pCur->aOverflow ){
3262 assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
3263 pCur->aOverflow[iIdx] = nextPage;
3267 if( offset>=ovflSize ){
3268 /* The only reason to read this page is to obtain the page
3269 ** number for the next page in the overflow chain. The page
3270 ** data is not required. So first try to lookup the overflow
3271 ** page-list cache, if any, then fall back to the getOverflowPage()
3274 #ifndef SQLITE_OMIT_INCRBLOB
3275 if( pCur->aOverflow && pCur->aOverflow[iIdx+1] ){
3276 nextPage = pCur->aOverflow[iIdx+1];
3279 rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
3282 /* Need to read this page properly. It contains some of the
3283 ** range of data that is being read (eOp==0) or written (eOp!=0).
3287 rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage);
3288 if( rc==SQLITE_OK ){
3289 aPayload = sqlite3PagerGetData(pDbPage);
3290 nextPage = get4byte(aPayload);
3291 if( a + offset > ovflSize ){
3292 a = ovflSize - offset;
3294 rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
3295 sqlite3PagerUnref(pDbPage);
3304 if( rc==SQLITE_OK && amt>0 ){
3305 return SQLITE_CORRUPT_BKPT;
3311 ** Read part of the key associated with cursor pCur. Exactly
3312 ** "amt" bytes will be transfered into pBuf[]. The transfer
3313 ** begins at "offset".
3315 ** Return SQLITE_OK on success or an error code if anything goes
3316 ** wrong. An error is returned if "offset+amt" is larger than
3317 ** the available payload.
3319 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3322 assert( cursorHoldsMutex(pCur) );
3323 rc = restoreCursorPosition(pCur);
3324 if( rc==SQLITE_OK ){
3325 assert( pCur->eState==CURSOR_VALID );
3326 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3327 if( pCur->apPage[0]->intKey ){
3328 return SQLITE_CORRUPT_BKPT;
3330 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3331 rc = accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0, 0);
3337 ** Read part of the data associated with cursor pCur. Exactly
3338 ** "amt" bytes will be transfered into pBuf[]. The transfer
3339 ** begins at "offset".
3341 ** Return SQLITE_OK on success or an error code if anything goes
3342 ** wrong. An error is returned if "offset+amt" is larger than
3343 ** the available payload.
3345 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
3348 #ifndef SQLITE_OMIT_INCRBLOB
3349 if ( pCur->eState==CURSOR_INVALID ){
3350 return SQLITE_ABORT;
3354 assert( cursorHoldsMutex(pCur) );
3355 rc = restoreCursorPosition(pCur);
3356 if( rc==SQLITE_OK ){
3357 assert( pCur->eState==CURSOR_VALID );
3358 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] );
3359 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3360 rc = accessPayload(pCur, offset, amt, pBuf, 1, 0);
3366 ** Return a pointer to payload information from the entry that the
3367 ** pCur cursor is pointing to. The pointer is to the beginning of
3368 ** the key if skipKey==0 and it points to the beginning of data if
3369 ** skipKey==1. The number of bytes of available key/data is written
3370 ** into *pAmt. If *pAmt==0, then the value returned will not be
3373 ** This routine is an optimization. It is common for the entire key
3374 ** and data to fit on the local page and for there to be no overflow
3375 ** pages. When that is so, this routine can be used to access the
3376 ** key and data without making a copy. If the key and/or data spills
3377 ** onto overflow pages, then accessPayload() must be used to reassembly
3378 ** the key/data and copy it into a preallocated buffer.
3380 ** The pointer returned by this routine looks directly into the cached
3381 ** page of the database. The data might change or move the next time
3382 ** any btree routine is called.
3384 static const unsigned char *fetchPayload(
3385 BtCursor *pCur, /* Cursor pointing to entry to read from */
3386 int *pAmt, /* Write the number of available bytes here */
3387 int skipKey /* read beginning at data if this is true */
3389 unsigned char *aPayload;
3394 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]);
3395 assert( pCur->eState==CURSOR_VALID );
3396 assert( cursorHoldsMutex(pCur) );
3397 pPage = pCur->apPage[pCur->iPage];
3398 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3400 aPayload = pCur->info.pCell;
3401 aPayload += pCur->info.nHeader;
3402 if( pPage->intKey ){
3405 nKey = pCur->info.nKey;
3409 nLocal = pCur->info.nLocal - nKey;
3411 nLocal = pCur->info.nLocal;
3422 ** For the entry that cursor pCur is point to, return as
3423 ** many bytes of the key or data as are available on the local
3424 ** b-tree page. Write the number of available bytes into *pAmt.
3426 ** The pointer returned is ephemeral. The key/data may move
3427 ** or be destroyed on the next call to any Btree routine,
3428 ** including calls from other threads against the same cache.
3429 ** Hence, a mutex on the BtShared should be held prior to calling
3432 ** These routines is used to get quick access to key and data
3433 ** in the common case where no overflow pages are used.
3435 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt){
3436 assert( cursorHoldsMutex(pCur) );
3437 if( pCur->eState==CURSOR_VALID ){
3438 return (const void*)fetchPayload(pCur, pAmt, 0);
3442 const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt){
3443 assert( cursorHoldsMutex(pCur) );
3444 if( pCur->eState==CURSOR_VALID ){
3445 return (const void*)fetchPayload(pCur, pAmt, 1);
3452 ** Move the cursor down to a new child page. The newPgno argument is the
3453 ** page number of the child page to move to.
3455 static int moveToChild(BtCursor *pCur, u32 newPgno){
3457 int i = pCur->iPage;
3459 BtShared *pBt = pCur->pBt;
3461 assert( cursorHoldsMutex(pCur) );
3462 assert( pCur->eState==CURSOR_VALID );
3463 assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
3464 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
3465 return SQLITE_CORRUPT_BKPT;
3467 rc = getAndInitPage(pBt, newPgno, &pNewPage);
3469 pCur->apPage[i+1] = pNewPage;
3470 pCur->aiIdx[i+1] = 0;
3473 pCur->info.nSize = 0;
3474 pCur->validNKey = 0;
3475 if( pNewPage->nCell<1 ){
3476 return SQLITE_CORRUPT_BKPT;
3483 ** Page pParent is an internal (non-leaf) tree page. This function
3484 ** asserts that page number iChild is the left-child if the iIdx'th
3485 ** cell in page pParent. Or, if iIdx is equal to the total number of
3486 ** cells in pParent, that page number iChild is the right-child of
3489 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
3490 assert( iIdx<=pParent->nCell );
3491 if( iIdx==pParent->nCell ){
3492 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
3494 assert( get4byte(findCell(pParent, iIdx))==iChild );
3498 # define assertParentIndex(x,y,z)
3502 ** Move the cursor up to the parent page.
3504 ** pCur->idx is set to the cell index that contains the pointer
3505 ** to the page we are coming from. If we are coming from the
3506 ** right-most child page then pCur->idx is set to one more than
3507 ** the largest cell index.
3509 void sqlite3BtreeMoveToParent(BtCursor *pCur){
3510 assert( cursorHoldsMutex(pCur) );
3511 assert( pCur->eState==CURSOR_VALID );
3512 assert( pCur->iPage>0 );
3513 assert( pCur->apPage[pCur->iPage] );
3515 pCur->apPage[pCur->iPage-1],
3516 pCur->aiIdx[pCur->iPage-1],
3517 pCur->apPage[pCur->iPage]->pgno
3519 releasePage(pCur->apPage[pCur->iPage]);
3521 pCur->info.nSize = 0;
3522 pCur->validNKey = 0;
3526 ** Move the cursor to the root page
3528 static int moveToRoot(BtCursor *pCur){
3531 Btree *p = pCur->pBtree;
3532 BtShared *pBt = p->pBt;
3534 assert( cursorHoldsMutex(pCur) );
3535 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
3536 assert( CURSOR_VALID < CURSOR_REQUIRESEEK );
3537 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK );
3538 if( pCur->eState>=CURSOR_REQUIRESEEK ){
3539 if( pCur->eState==CURSOR_FAULT ){
3542 sqlite3BtreeClearCursor(pCur);
3545 if( pCur->iPage>=0 ){
3547 for(i=1; i<=pCur->iPage; i++){
3548 releasePage(pCur->apPage[i]);
3552 SQLITE_OK!=(rc = getAndInitPage(pBt, pCur->pgnoRoot, &pCur->apPage[0]))
3554 pCur->eState = CURSOR_INVALID;
3559 pRoot = pCur->apPage[0];
3560 assert( pRoot->pgno==pCur->pgnoRoot );
3563 pCur->info.nSize = 0;
3565 pCur->validNKey = 0;
3567 if( pRoot->nCell==0 && !pRoot->leaf ){
3569 assert( pRoot->pgno==1 );
3570 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
3571 assert( subpage>0 );
3572 pCur->eState = CURSOR_VALID;
3573 rc = moveToChild(pCur, subpage);
3575 pCur->eState = ((pRoot->nCell>0)?CURSOR_VALID:CURSOR_INVALID);
3581 ** Move the cursor down to the left-most leaf entry beneath the
3582 ** entry to which it is currently pointing.
3584 ** The left-most leaf is the one with the smallest key - the first
3585 ** in ascending order.
3587 static int moveToLeftmost(BtCursor *pCur){
3592 assert( cursorHoldsMutex(pCur) );
3593 assert( pCur->eState==CURSOR_VALID );
3594 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3595 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
3596 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage]));
3597 rc = moveToChild(pCur, pgno);
3603 ** Move the cursor down to the right-most leaf entry beneath the
3604 ** page to which it is currently pointing. Notice the difference
3605 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost()
3606 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
3607 ** finds the right-most entry beneath the *page*.
3609 ** The right-most entry is the one with the largest key - the last
3610 ** key in ascending order.
3612 static int moveToRightmost(BtCursor *pCur){
3617 assert( cursorHoldsMutex(pCur) );
3618 assert( pCur->eState==CURSOR_VALID );
3619 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){
3620 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3621 pCur->aiIdx[pCur->iPage] = pPage->nCell;
3622 rc = moveToChild(pCur, pgno);
3624 if( rc==SQLITE_OK ){
3625 pCur->aiIdx[pCur->iPage] = pPage->nCell-1;
3626 pCur->info.nSize = 0;
3627 pCur->validNKey = 0;
3632 /* Move the cursor to the first entry in the table. Return SQLITE_OK
3633 ** on success. Set *pRes to 0 if the cursor actually points to something
3634 ** or set *pRes to 1 if the table is empty.
3636 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
3639 assert( cursorHoldsMutex(pCur) );
3640 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3641 rc = moveToRoot(pCur);
3642 if( rc==SQLITE_OK ){
3643 if( pCur->eState==CURSOR_INVALID ){
3644 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3648 assert( pCur->apPage[pCur->iPage]->nCell>0 );
3650 rc = moveToLeftmost(pCur);
3656 /* Move the cursor to the last entry in the table. Return SQLITE_OK
3657 ** on success. Set *pRes to 0 if the cursor actually points to something
3658 ** or set *pRes to 1 if the table is empty.
3660 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
3663 assert( cursorHoldsMutex(pCur) );
3664 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3665 rc = moveToRoot(pCur);
3666 if( rc==SQLITE_OK ){
3667 if( CURSOR_INVALID==pCur->eState ){
3668 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3671 assert( pCur->eState==CURSOR_VALID );
3673 rc = moveToRightmost(pCur);
3675 pCur->atLast = rc==SQLITE_OK;
3681 /* Move the cursor so that it points to an entry near the key
3682 ** specified by pIdxKey or intKey. Return a success code.
3684 ** For INTKEY tables, the intKey parameter is used. pIdxKey
3685 ** must be NULL. For index tables, pIdxKey is used and intKey
3688 ** If an exact match is not found, then the cursor is always
3689 ** left pointing at a leaf page which would hold the entry if it
3690 ** were present. The cursor might point to an entry that comes
3691 ** before or after the key.
3693 ** The result of comparing the key with the entry to which the
3694 ** cursor is written to *pRes if pRes!=NULL. The meaning of
3695 ** this value is as follows:
3697 ** *pRes<0 The cursor is left pointing at an entry that
3698 ** is smaller than pKey or if the table is empty
3699 ** and the cursor is therefore left point to nothing.
3701 ** *pRes==0 The cursor is left pointing at an entry that
3702 ** exactly matches pKey.
3704 ** *pRes>0 The cursor is left pointing at an entry that
3705 ** is larger than pKey.
3708 int sqlite3BtreeMovetoUnpacked(
3709 BtCursor *pCur, /* The cursor to be moved */
3710 UnpackedRecord *pIdxKey, /* Unpacked index key */
3711 i64 intKey, /* The table key */
3712 int biasRight, /* If true, bias the search to the high end */
3713 int *pRes /* Write search results here */
3717 assert( cursorHoldsMutex(pCur) );
3718 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3720 /* If the cursor is already positioned at the point we are trying
3721 ** to move to, then just return without doing any work */
3722 if( pCur->eState==CURSOR_VALID && pCur->validNKey
3723 && pCur->apPage[0]->intKey
3725 if( pCur->info.nKey==intKey ){
3729 if( pCur->atLast && pCur->info.nKey<intKey ){
3735 rc = moveToRoot(pCur);
3739 assert( pCur->apPage[pCur->iPage] );
3740 assert( pCur->apPage[pCur->iPage]->isInit );
3741 if( pCur->eState==CURSOR_INVALID ){
3743 assert( pCur->apPage[pCur->iPage]->nCell==0 );
3746 assert( pCur->apPage[0]->intKey || pIdxKey );
3750 MemPage *pPage = pCur->apPage[pCur->iPage];
3751 int c = -1; /* pRes return if table is empty must be -1 */
3753 upr = pPage->nCell-1;
3754 if( !pPage->intKey && pIdxKey==0 ){
3755 rc = SQLITE_CORRUPT_BKPT;
3759 pCur->aiIdx[pCur->iPage] = upr;
3761 pCur->aiIdx[pCur->iPage] = (upr+lwr)/2;
3763 if( lwr<=upr ) for(;;){
3766 int idx = pCur->aiIdx[pCur->iPage];
3767 pCur->info.nSize = 0;
3768 pCur->validNKey = 1;
3769 if( pPage->intKey ){
3771 pCell = findCell(pPage, idx) + pPage->childPtrSize;
3772 if( pPage->hasData ){
3774 pCell += getVarint32(pCell, dummy);
3776 getVarint(pCell, (u64*)&nCellKey);
3777 if( nCellKey==intKey ){
3779 }else if( nCellKey<intKey ){
3782 assert( nCellKey>intKey );
3787 pCellKey = (void *)fetchPayload(pCur, &available, 0);
3788 nCellKey = pCur->info.nKey;
3789 if( available>=nCellKey ){
3790 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
3792 pCellKey = sqlite3Malloc( nCellKey );
3797 rc = sqlite3BtreeKey(pCur, 0, nCellKey, (void *)pCellKey);
3798 c = sqlite3VdbeRecordCompare(nCellKey, pCellKey, pIdxKey);
3799 sqlite3_free(pCellKey);
3800 if( rc ) goto moveto_finish;
3804 pCur->info.nKey = nCellKey;
3805 if( pPage->intKey && !pPage->leaf ){
3810 if( pRes ) *pRes = 0;
3821 pCur->info.nKey = nCellKey;
3824 pCur->aiIdx[pCur->iPage] = (lwr+upr)/2;
3826 assert( lwr==upr+1 );
3827 assert( pPage->isInit );
3830 }else if( lwr>=pPage->nCell ){
3831 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3833 chldPg = get4byte(findCell(pPage, lwr));
3836 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell );
3837 if( pRes ) *pRes = c;
3841 pCur->aiIdx[pCur->iPage] = lwr;
3842 pCur->info.nSize = 0;
3843 pCur->validNKey = 0;
3844 rc = moveToChild(pCur, chldPg);
3845 if( rc ) goto moveto_finish;
3852 ** In this version of BtreeMoveto, pKey is a packed index record
3853 ** such as is generated by the OP_MakeRecord opcode. Unpack the
3854 ** record and then call BtreeMovetoUnpacked() to do the work.
3856 int sqlite3BtreeMoveto(
3857 BtCursor *pCur, /* Cursor open on the btree to be searched */
3858 const void *pKey, /* Packed key if the btree is an index */
3859 i64 nKey, /* Integer key for tables. Size of pKey for indices */
3860 int bias, /* Bias search to the high end */
3861 int *pRes /* Write search results here */
3863 int rc; /* Status code */
3864 UnpackedRecord *pIdxKey; /* Unpacked index key */
3865 UnpackedRecord aSpace[16]; /* Temp space for pIdxKey - to avoid a malloc */
3868 pIdxKey = sqlite3VdbeRecordUnpack(pCur->pKeyInfo, nKey, pKey,
3869 aSpace, sizeof(aSpace));
3870 if( pIdxKey==0 ) return SQLITE_NOMEM;
3874 rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
3876 sqlite3VdbeDeleteUnpackedRecord(pIdxKey);
3883 ** Return TRUE if the cursor is not pointing at an entry of the table.
3885 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
3886 ** past the last entry in the table or sqlite3BtreePrev() moves past
3887 ** the first entry. TRUE is also returned if the table is empty.
3889 int sqlite3BtreeEof(BtCursor *pCur){
3890 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
3891 ** have been deleted? This API will need to change to return an error code
3892 ** as well as the boolean result value.
3894 return (CURSOR_VALID!=pCur->eState);
3898 ** Return the database connection handle for a cursor.
3900 sqlite3 *sqlite3BtreeCursorDb(const BtCursor *pCur){
3901 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
3902 return pCur->pBtree->db;
3906 ** Advance the cursor to the next entry in the database. If
3907 ** successful then set *pRes=0. If the cursor
3908 ** was already pointing to the last entry in the database before
3909 ** this routine was called, then set *pRes=1.
3911 int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
3916 assert( cursorHoldsMutex(pCur) );
3917 rc = restoreCursorPosition(pCur);
3918 if( rc!=SQLITE_OK ){
3922 if( CURSOR_INVALID==pCur->eState ){
3933 pPage = pCur->apPage[pCur->iPage];
3934 idx = ++pCur->aiIdx[pCur->iPage];
3935 assert( pPage->isInit );
3936 assert( idx<=pPage->nCell );
3938 pCur->info.nSize = 0;
3939 pCur->validNKey = 0;
3940 if( idx>=pPage->nCell ){
3942 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
3944 rc = moveToLeftmost(pCur);
3949 if( pCur->iPage==0 ){
3951 pCur->eState = CURSOR_INVALID;
3954 sqlite3BtreeMoveToParent(pCur);
3955 pPage = pCur->apPage[pCur->iPage];
3956 }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
3958 if( pPage->intKey ){
3959 rc = sqlite3BtreeNext(pCur, pRes);
3969 rc = moveToLeftmost(pCur);
3975 ** Step the cursor to the back to the previous entry in the database. If
3976 ** successful then set *pRes=0. If the cursor
3977 ** was already pointing to the first entry in the database before
3978 ** this routine was called, then set *pRes=1.
3980 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
3984 assert( cursorHoldsMutex(pCur) );
3985 rc = restoreCursorPosition(pCur);
3986 if( rc!=SQLITE_OK ){
3990 if( CURSOR_INVALID==pCur->eState ){
4001 pPage = pCur->apPage[pCur->iPage];
4002 assert( pPage->isInit );
4004 int idx = pCur->aiIdx[pCur->iPage];
4005 rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
4009 rc = moveToRightmost(pCur);
4011 while( pCur->aiIdx[pCur->iPage]==0 ){
4012 if( pCur->iPage==0 ){
4013 pCur->eState = CURSOR_INVALID;
4017 sqlite3BtreeMoveToParent(pCur);
4019 pCur->info.nSize = 0;
4020 pCur->validNKey = 0;
4022 pCur->aiIdx[pCur->iPage]--;
4023 pPage = pCur->apPage[pCur->iPage];
4024 if( pPage->intKey && !pPage->leaf ){
4025 rc = sqlite3BtreePrevious(pCur, pRes);
4035 ** Allocate a new page from the database file.
4037 ** The new page is marked as dirty. (In other words, sqlite3PagerWrite()
4038 ** has already been called on the new page.) The new page has also
4039 ** been referenced and the calling routine is responsible for calling
4040 ** sqlite3PagerUnref() on the new page when it is done.
4042 ** SQLITE_OK is returned on success. Any other return value indicates
4043 ** an error. *ppPage and *pPgno are undefined in the event of an error.
4044 ** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
4046 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
4047 ** locate a page close to the page number "nearby". This can be used in an
4048 ** attempt to keep related pages close to each other in the database file,
4049 ** which in turn can make database access faster.
4051 ** If the "exact" parameter is not 0, and the page-number nearby exists
4052 ** anywhere on the free-list, then it is guarenteed to be returned. This
4053 ** is only used by auto-vacuum databases when allocating a new table.
4055 static int allocateBtreePage(
4064 int n; /* Number of pages on the freelist */
4065 int k; /* Number of leaves on the trunk of the freelist */
4066 MemPage *pTrunk = 0;
4067 MemPage *pPrevTrunk = 0;
4069 assert( sqlite3_mutex_held(pBt->mutex) );
4070 pPage1 = pBt->pPage1;
4071 n = get4byte(&pPage1->aData[36]);
4073 /* There are pages on the freelist. Reuse one of those pages. */
4075 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
4077 /* If the 'exact' parameter was true and a query of the pointer-map
4078 ** shows that the page 'nearby' is somewhere on the free-list, then
4079 ** the entire-list will be searched for that page.
4081 #ifndef SQLITE_OMIT_AUTOVACUUM
4082 if( exact && nearby<=pagerPagecount(pBt->pPager) ){
4085 assert( pBt->autoVacuum );
4086 rc = ptrmapGet(pBt, nearby, &eType, 0);
4088 if( eType==PTRMAP_FREEPAGE ){
4095 /* Decrement the free-list count by 1. Set iTrunk to the index of the
4096 ** first free-list trunk page. iPrevTrunk is initially 1.
4098 rc = sqlite3PagerWrite(pPage1->pDbPage);
4100 put4byte(&pPage1->aData[36], n-1);
4102 /* The code within this loop is run only once if the 'searchList' variable
4103 ** is not true. Otherwise, it runs once for each trunk-page on the
4104 ** free-list until the page 'nearby' is located.
4107 pPrevTrunk = pTrunk;
4109 iTrunk = get4byte(&pPrevTrunk->aData[0]);
4111 iTrunk = get4byte(&pPage1->aData[32]);
4113 rc = sqlite3BtreeGetPage(pBt, iTrunk, &pTrunk, 0);
4116 goto end_allocate_page;
4119 k = get4byte(&pTrunk->aData[4]);
4120 if( k==0 && !searchList ){
4121 /* The trunk has no leaves and the list is not being searched.
4122 ** So extract the trunk page itself and use it as the newly
4123 ** allocated page */
4124 assert( pPrevTrunk==0 );
4125 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4127 goto end_allocate_page;
4130 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4133 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4134 }else if( k>pBt->usableSize/4 - 2 ){
4135 /* Value of k is out of range. Database corruption */
4136 rc = SQLITE_CORRUPT_BKPT;
4137 goto end_allocate_page;
4138 #ifndef SQLITE_OMIT_AUTOVACUUM
4139 }else if( searchList && nearby==iTrunk ){
4140 /* The list is being searched and this trunk page is the page
4141 ** to allocate, regardless of whether it has leaves.
4143 assert( *pPgno==iTrunk );
4146 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4148 goto end_allocate_page;
4152 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
4154 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
4157 /* The trunk page is required by the caller but it contains
4158 ** pointers to free-list leaves. The first leaf becomes a trunk
4159 ** page in this case.
4162 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
4163 rc = sqlite3BtreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0);
4164 if( rc!=SQLITE_OK ){
4165 goto end_allocate_page;
4167 rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
4168 if( rc!=SQLITE_OK ){
4169 releasePage(pNewTrunk);
4170 goto end_allocate_page;
4172 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
4173 put4byte(&pNewTrunk->aData[4], k-1);
4174 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
4175 releasePage(pNewTrunk);
4177 put4byte(&pPage1->aData[32], iNewTrunk);
4179 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
4181 goto end_allocate_page;
4183 put4byte(&pPrevTrunk->aData[0], iNewTrunk);
4187 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
4190 /* Extract a leaf from the trunk */
4193 unsigned char *aData = pTrunk->aData;
4194 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4196 goto end_allocate_page;
4201 dist = get4byte(&aData[8]) - nearby;
4202 if( dist<0 ) dist = -dist;
4204 int d2 = get4byte(&aData[8+i*4]) - nearby;
4205 if( d2<0 ) d2 = -d2;
4215 iPage = get4byte(&aData[8+closest*4]);
4216 if( !searchList || iPage==nearby ){
4219 nPage = pagerPagecount(pBt->pPager);
4221 /* Free page off the end of the file */
4222 rc = SQLITE_CORRUPT_BKPT;
4223 goto end_allocate_page;
4225 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
4226 ": %d more free pages\n",
4227 *pPgno, closest+1, k, pTrunk->pgno, n-1));
4229 memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
4231 put4byte(&aData[4], k-1);
4232 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 1);
4233 if( rc==SQLITE_OK ){
4234 sqlite3PagerDontRollback((*ppPage)->pDbPage);
4235 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4236 if( rc!=SQLITE_OK ){
4237 releasePage(*ppPage);
4243 releasePage(pPrevTrunk);
4245 }while( searchList );
4247 /* There are no pages on the freelist, so create a new page at the
4248 ** end of the file */
4249 int nPage = pagerPagecount(pBt->pPager);
4252 #ifndef SQLITE_OMIT_AUTOVACUUM
4254 /* An incr-vacuum has already run within this transaction. So the
4255 ** page to allocate is not from the physical end of the file, but
4258 *pPgno = pBt->nTrunc+1;
4259 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){
4263 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, *pPgno) ){
4264 /* If *pPgno refers to a pointer-map page, allocate two new pages
4265 ** at the end of the file instead of one. The first allocated page
4266 ** becomes a new pointer-map page, the second is used by the caller.
4268 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", *pPgno));
4269 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4271 if( *pPgno==PENDING_BYTE_PAGE(pBt) ){ (*pPgno)++; }
4274 pBt->nTrunc = *pPgno;
4278 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4279 rc = sqlite3BtreeGetPage(pBt, *pPgno, ppPage, 0);
4281 rc = sqlite3PagerWrite((*ppPage)->pDbPage);
4282 if( rc!=SQLITE_OK ){
4283 releasePage(*ppPage);
4285 TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
4288 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
4291 releasePage(pTrunk);
4292 releasePage(pPrevTrunk);
4293 if( rc==SQLITE_OK ){
4294 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
4295 releasePage(*ppPage);
4296 return SQLITE_CORRUPT_BKPT;
4298 (*ppPage)->isInit = 0;
4304 ** Add a page of the database file to the freelist.
4306 ** sqlite3PagerUnref() is NOT called for pPage.
4308 static int freePage(MemPage *pPage){
4309 BtShared *pBt = pPage->pBt;
4310 MemPage *pPage1 = pBt->pPage1;
4313 /* Prepare the page for freeing */
4314 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4315 assert( pPage->pgno>1 );
4318 /* Increment the free page count on pPage1 */
4319 rc = sqlite3PagerWrite(pPage1->pDbPage);
4321 n = get4byte(&pPage1->aData[36]);
4322 put4byte(&pPage1->aData[36], n+1);
4324 #ifdef SQLITE_SECURE_DELETE
4325 /* If the SQLITE_SECURE_DELETE compile-time option is enabled, then
4326 ** always fully overwrite deleted information with zeros.
4328 rc = sqlite3PagerWrite(pPage->pDbPage);
4330 memset(pPage->aData, 0, pPage->pBt->pageSize);
4333 /* If the database supports auto-vacuum, write an entry in the pointer-map
4334 ** to indicate that the page is free.
4337 rc = ptrmapPut(pBt, pPage->pgno, PTRMAP_FREEPAGE, 0);
4342 /* This is the first free page */
4343 rc = sqlite3PagerWrite(pPage->pDbPage);
4345 memset(pPage->aData, 0, 8);
4346 put4byte(&pPage1->aData[32], pPage->pgno);
4347 TRACE(("FREE-PAGE: %d first\n", pPage->pgno));
4349 /* Other free pages already exist. Retrive the first trunk page
4350 ** of the freelist and find out how many leaves it has. */
4352 rc = sqlite3BtreeGetPage(pBt, get4byte(&pPage1->aData[32]), &pTrunk, 0);
4354 k = get4byte(&pTrunk->aData[4]);
4355 if( k>=pBt->usableSize/4 - 8 ){
4356 /* The trunk is full. Turn the page being freed into a new
4357 ** trunk page with no leaves.
4359 ** Note that the trunk page is not really full until it contains
4360 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
4361 ** coded. But due to a coding error in versions of SQLite prior to
4362 ** 3.6.0, databases with freelist trunk pages holding more than
4363 ** usableSize/4 - 8 entries will be reported as corrupt. In order
4364 ** to maintain backwards compatibility with older versions of SQLite,
4365 ** we will contain to restrict the number of entries to usableSize/4 - 8
4366 ** for now. At some point in the future (once everyone has upgraded
4367 ** to 3.6.0 or later) we should consider fixing the conditional above
4368 ** to read "usableSize/4-2" instead of "usableSize/4-8".
4370 rc = sqlite3PagerWrite(pPage->pDbPage);
4371 if( rc==SQLITE_OK ){
4372 put4byte(pPage->aData, pTrunk->pgno);
4373 put4byte(&pPage->aData[4], 0);
4374 put4byte(&pPage1->aData[32], pPage->pgno);
4375 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n",
4376 pPage->pgno, pTrunk->pgno));
4379 rc = SQLITE_CORRUPT;
4381 /* Add the newly freed page as a leaf on the current trunk */
4382 rc = sqlite3PagerWrite(pTrunk->pDbPage);
4383 if( rc==SQLITE_OK ){
4384 put4byte(&pTrunk->aData[4], k+1);
4385 put4byte(&pTrunk->aData[8+k*4], pPage->pgno);
4386 #ifndef SQLITE_SECURE_DELETE
4387 rc = sqlite3PagerDontWrite(pPage->pDbPage);
4390 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
4392 releasePage(pTrunk);
4398 ** Free any overflow pages associated with the given Cell.
4400 static int clearCell(MemPage *pPage, unsigned char *pCell){
4401 BtShared *pBt = pPage->pBt;
4408 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4409 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4410 if( info.iOverflow==0 ){
4411 return SQLITE_OK; /* No overflow pages. Return without doing anything */
4413 ovflPgno = get4byte(&pCell[info.iOverflow]);
4414 ovflPageSize = pBt->usableSize - 4;
4415 nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
4416 assert( ovflPgno==0 || nOvfl>0 );
4419 if( ovflPgno==0 || ovflPgno>pagerPagecount(pBt->pPager) ){
4420 return SQLITE_CORRUPT_BKPT;
4423 rc = getOverflowPage(pBt, ovflPgno, &pOvfl, (nOvfl==0)?0:&ovflPgno);
4425 rc = freePage(pOvfl);
4426 sqlite3PagerUnref(pOvfl->pDbPage);
4433 ** Create the byte sequence used to represent a cell on page pPage
4434 ** and write that byte sequence into pCell[]. Overflow pages are
4435 ** allocated and filled in as necessary. The calling procedure
4436 ** is responsible for making sure sufficient space has been allocated
4439 ** Note that pCell does not necessary need to point to the pPage->aData
4440 ** area. pCell might point to some temporary storage. The cell will
4441 ** be constructed in this temporary area then copied into pPage->aData
4444 static int fillInCell(
4445 MemPage *pPage, /* The page that contains the cell */
4446 unsigned char *pCell, /* Complete text of the cell */
4447 const void *pKey, i64 nKey, /* The key */
4448 const void *pData,int nData, /* The data */
4449 int nZero, /* Extra zero bytes to append to pData */
4450 int *pnSize /* Write cell size here */
4457 MemPage *pToRelease = 0;
4458 unsigned char *pPrior;
4459 unsigned char *pPayload;
4460 BtShared *pBt = pPage->pBt;
4465 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4467 /* Fill in the header. */
4472 if( pPage->hasData ){
4473 nHeader += putVarint(&pCell[nHeader], nData+nZero);
4477 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey);
4478 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4479 assert( info.nHeader==nHeader );
4480 assert( info.nKey==nKey );
4481 assert( info.nData==nData+nZero );
4483 /* Fill in the payload */
4484 nPayload = nData + nZero;
4485 if( pPage->intKey ){
4494 *pnSize = info.nSize;
4495 spaceLeft = info.nLocal;
4496 pPayload = &pCell[nHeader];
4497 pPrior = &pCell[info.iOverflow];
4499 while( nPayload>0 ){
4502 #ifndef SQLITE_OMIT_AUTOVACUUM
4503 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
4504 if( pBt->autoVacuum ){
4508 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
4515 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, isExact);
4516 #ifndef SQLITE_OMIT_AUTOVACUUM
4517 /* If the database supports auto-vacuum, and the second or subsequent
4518 ** overflow page is being allocated, add an entry to the pointer-map
4519 ** for that page now.
4521 ** If this is the first overflow page, then write a partial entry
4522 ** to the pointer-map. If we write nothing to this pointer-map slot,
4523 ** then the optimistic overflow chain processing in clearCell()
4524 ** may misinterpret the uninitialised values and delete the
4525 ** wrong pages from the database.
4527 if( pBt->autoVacuum && rc==SQLITE_OK ){
4528 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
4529 rc = ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap);
4536 releasePage(pToRelease);
4539 put4byte(pPrior, pgnoOvfl);
4540 releasePage(pToRelease);
4542 pPrior = pOvfl->aData;
4543 put4byte(pPrior, 0);
4544 pPayload = &pOvfl->aData[4];
4545 spaceLeft = pBt->usableSize - 4;
4548 if( n>spaceLeft ) n = spaceLeft;
4550 if( n>nSrc ) n = nSrc;
4552 memcpy(pPayload, pSrc, n);
4554 memset(pPayload, 0, n);
4566 releasePage(pToRelease);
4571 ** Remove the i-th cell from pPage. This routine effects pPage only.
4572 ** The cell content is not freed or deallocated. It is assumed that
4573 ** the cell content has been copied someplace else. This routine just
4574 ** removes the reference to the cell from pPage.
4576 ** "sz" must be the number of bytes in the cell.
4578 static int dropCell(MemPage *pPage, int idx, int sz){
4579 int i; /* Loop counter */
4580 int pc; /* Offset to cell content of cell being deleted */
4581 u8 *data; /* pPage->aData */
4582 u8 *ptr; /* Used to move bytes around within data[] */
4583 int rc; /* Return code */
4585 assert( idx>=0 && idx<pPage->nCell );
4586 assert( sz==cellSize(pPage, idx) );
4587 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4588 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4589 data = pPage->aData;
4590 ptr = &data[pPage->cellOffset + 2*idx];
4592 if( pc<pPage->hdrOffset+6+(pPage->leaf?0:4)
4593 || pc+sz>pPage->pBt->usableSize
4595 return SQLITE_CORRUPT_BKPT;
4597 rc = freeSpace(pPage, pc, sz);
4599 for(i=idx+1; i<pPage->nCell; i++, ptr+=2){
4604 put2byte(&data[pPage->hdrOffset+3], pPage->nCell);
4610 ** Insert a new cell on pPage at cell index "i". pCell points to the
4611 ** content of the cell.
4613 ** If the cell content will fit on the page, then put it there. If it
4614 ** will not fit, then make a copy of the cell content into pTemp if
4615 ** pTemp is not null. Regardless of pTemp, allocate a new entry
4616 ** in pPage->aOvfl[] and make it point to the cell content (either
4617 ** in pTemp or the original pCell) and also record its index.
4618 ** Allocating a new entry in pPage->aCell[] implies that
4619 ** pPage->nOverflow is incremented.
4621 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the
4622 ** cell. The caller will overwrite them after this function returns. If
4623 ** nSkip is non-zero, then pCell may not point to an invalid memory location
4624 ** (but pCell+nSkip is always valid).
4626 static int insertCell(
4627 MemPage *pPage, /* Page into which we are copying */
4628 int i, /* New cell becomes the i-th cell of the page */
4629 u8 *pCell, /* Content of the new cell */
4630 int sz, /* Bytes of content in pCell */
4631 u8 *pTemp, /* Temp storage space for pCell, if needed */
4632 u8 nSkip /* Do not write the first nSkip bytes of the cell */
4634 int idx; /* Where to write new cell content in data[] */
4635 int j; /* Loop counter */
4636 int top; /* First byte of content for any cell in data[] */
4637 int end; /* First byte past the last cell pointer in data[] */
4638 int ins; /* Index in data[] where new cell pointer is inserted */
4639 int hdr; /* Offset into data[] of the page header */
4640 int cellOffset; /* Address of first cell pointer in data[] */
4641 u8 *data; /* The content of the whole page */
4642 u8 *ptr; /* Used for moving information around in data[] */
4644 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
4645 assert( sz==cellSizePtr(pPage, pCell) );
4646 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4647 if( pPage->nOverflow || sz+2>pPage->nFree ){
4649 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip);
4652 j = pPage->nOverflow++;
4653 assert( j<sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0]) );
4654 pPage->aOvfl[j].pCell = pCell;
4655 pPage->aOvfl[j].idx = i;
4658 int rc = sqlite3PagerWrite(pPage->pDbPage);
4659 if( rc!=SQLITE_OK ){
4662 assert( sqlite3PagerIswriteable(pPage->pDbPage) );
4663 data = pPage->aData;
4664 hdr = pPage->hdrOffset;
4665 top = get2byte(&data[hdr+5]);
4666 cellOffset = pPage->cellOffset;
4667 end = cellOffset + 2*pPage->nCell + 2;
4668 ins = cellOffset + 2*i;
4669 if( end > top - sz ){
4670 rc = defragmentPage(pPage);
4672 top = get2byte(&data[hdr+5]);
4673 assert( end + sz <= top );
4675 idx = allocateSpace(pPage, sz);
4677 assert( end <= get2byte(&data[hdr+5]) );
4678 if( idx+sz > pPage->pBt->usableSize ){
4679 return SQLITE_CORRUPT_BKPT;
4683 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip);
4684 for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){
4688 put2byte(&data[ins], idx);
4689 put2byte(&data[hdr+3], pPage->nCell);
4690 #ifndef SQLITE_OMIT_AUTOVACUUM
4691 if( pPage->pBt->autoVacuum ){
4692 /* The cell may contain a pointer to an overflow page. If so, write
4693 ** the entry for the overflow page into the pointer map.
4696 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4697 assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload );
4698 if( (info.nData+(pPage->intKey?0:info.nKey))>info.nLocal ){
4699 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
4700 rc = ptrmapPut(pPage->pBt, pgnoOvfl, PTRMAP_OVERFLOW1, pPage->pgno);
4701 if( rc!=SQLITE_OK ) return rc;
4711 ** Add a list of cells to a page. The page should be initially empty.
4712 ** The cells are guaranteed to fit on the page.
4714 static void assemblePage(
4715 MemPage *pPage, /* The page to be assemblied */
4716 int nCell, /* The number of cells to add to this page */
4717 u8 **apCell, /* Pointers to cell bodies */
4718 u16 *aSize /* Sizes of the cells */
4720 int i; /* Loop counter */
4721 int totalSize; /* Total size of all cells */
4722 int hdr; /* Index of page header */
4723 int cellptr; /* Address of next cell pointer */
4724 int cellbody; /* Address of next cell body */
4725 u8 *data; /* Data for the page */
4727 assert( pPage->nOverflow==0 );
4728 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4730 for(i=0; i<nCell; i++){
4731 totalSize += aSize[i];
4733 assert( totalSize+2*nCell<=pPage->nFree );
4734 assert( pPage->nCell==0 );
4735 cellptr = pPage->cellOffset;
4736 data = pPage->aData;
4737 hdr = pPage->hdrOffset;
4738 put2byte(&data[hdr+3], nCell);
4740 cellbody = allocateSpace(pPage, totalSize);
4741 assert( cellbody>0 );
4742 assert( pPage->nFree >= 2*nCell );
4743 pPage->nFree -= 2*nCell;
4744 for(i=0; i<nCell; i++){
4745 put2byte(&data[cellptr], cellbody);
4746 memcpy(&data[cellbody], apCell[i], aSize[i]);
4748 cellbody += aSize[i];
4750 assert( cellbody==pPage->pBt->usableSize );
4752 pPage->nCell = nCell;
4756 ** The following parameters determine how many adjacent pages get involved
4757 ** in a balancing operation. NN is the number of neighbors on either side
4758 ** of the page that participate in the balancing operation. NB is the
4759 ** total number of pages that participate, including the target page and
4760 ** NN neighbors on either side.
4762 ** The minimum value of NN is 1 (of course). Increasing NN above 1
4763 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
4764 ** in exchange for a larger degradation in INSERT and UPDATE performance.
4765 ** The value of NN appears to give the best results overall.
4767 #define NN 1 /* Number of neighbors on either side of pPage */
4768 #define NB (NN*2+1) /* Total pages involved in the balance */
4770 /* Forward reference */
4771 static int balance(BtCursor*, int);
4773 #ifndef SQLITE_OMIT_QUICKBALANCE
4775 ** This version of balance() handles the common special case where
4776 ** a new entry is being inserted on the extreme right-end of the
4777 ** tree, in other words, when the new entry will become the largest
4778 ** entry in the tree.
4780 ** Instead of trying balance the 3 right-most leaf pages, just add
4781 ** a new page to the right-hand side and put the one new entry in
4782 ** that page. This leaves the right side of the tree somewhat
4783 ** unbalanced. But odds are that we will be inserting new entries
4784 ** at the end soon afterwards so the nearly empty page will quickly
4785 ** fill up. On average.
4787 ** pPage is the leaf page which is the right-most page in the tree.
4788 ** pParent is its parent. pPage must have a single overflow entry
4789 ** which is also the right-most entry on the page.
4791 static int balance_quick(BtCursor *pCur){
4798 MemPage *pPage = pCur->apPage[pCur->iPage];
4799 MemPage *pParent = pCur->apPage[pCur->iPage-1];
4800 BtShared *pBt = pPage->pBt;
4801 int parentIdx = pParent->nCell; /* pParent new divider cell index */
4802 int parentSize; /* Size of new divider cell */
4803 u8 parentCell[64]; /* Space for the new divider cell */
4805 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4807 /* Allocate a new page. Insert the overflow cell from pPage
4808 ** into it. Then remove the overflow cell from pPage.
4810 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
4811 if( rc==SQLITE_OK ){
4812 pCell = pPage->aOvfl[0].pCell;
4813 szCell = cellSizePtr(pPage, pCell);
4814 zeroPage(pNew, pPage->aData[0]);
4815 assemblePage(pNew, 1, &pCell, &szCell);
4816 pPage->nOverflow = 0;
4818 /* pPage is currently the right-child of pParent. Change this
4819 ** so that the right-child is the new page allocated above and
4820 ** pPage is the next-to-right child.
4822 ** Ignore the return value of the call to fillInCell(). fillInCell()
4823 ** may only return other than SQLITE_OK if it is required to allocate
4824 ** one or more overflow pages. Since an internal table B-Tree cell
4825 ** may never spill over onto an overflow page (it is a maximum of
4826 ** 13 bytes in size), it is not neccessary to check the return code.
4828 ** Similarly, the insertCell() function cannot fail if the page
4829 ** being inserted into is already writable and the cell does not
4830 ** contain an overflow pointer. So ignore this return code too.
4832 assert( pPage->nCell>0 );
4833 pCell = findCell(pPage, pPage->nCell-1);
4834 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
4835 fillInCell(pParent, parentCell, 0, info.nKey, 0, 0, 0, &parentSize);
4836 assert( parentSize<64 );
4837 assert( sqlite3PagerIswriteable(pParent->pDbPage) );
4838 insertCell(pParent, parentIdx, parentCell, parentSize, 0, 4);
4839 put4byte(findOverflowCell(pParent,parentIdx), pPage->pgno);
4840 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
4842 /* If this is an auto-vacuum database, update the pointer map
4843 ** with entries for the new page, and any pointer from the
4844 ** cell on the page to an overflow page.
4847 rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno);
4848 if( rc==SQLITE_OK ){
4849 rc = ptrmapPutOvfl(pNew, 0);
4853 /* Release the reference to the new page. */
4857 /* At this point the pPage->nFree variable is not set correctly with
4858 ** respect to the content of the page (because it was set to 0 by
4859 ** insertCell). So call sqlite3BtreeInitPage() to make sure it is
4862 ** This has to be done even if an error will be returned. Normally, if
4863 ** an error occurs during tree balancing, the contents of MemPage are
4864 ** not important, as they will be recalculated when the page is rolled
4865 ** back. But here, in balance_quick(), it is possible that pPage has
4866 ** not yet been marked dirty or written into the journal file. Therefore
4867 ** it will not be rolled back and so it is important to make sure that
4868 ** the page data and contents of MemPage are consistent.
4871 sqlite3BtreeInitPage(pPage);
4873 /* If everything else succeeded, balance the parent page, in
4874 ** case the divider cell inserted caused it to become overfull.
4876 if( rc==SQLITE_OK ){
4879 rc = balance(pCur, 0);
4883 #endif /* SQLITE_OMIT_QUICKBALANCE */
4886 ** This routine redistributes Cells on pPage and up to NN*2 siblings
4887 ** of pPage so that all pages have about the same amount of free space.
4888 ** Usually NN siblings on either side of pPage is used in the balancing,
4889 ** though more siblings might come from one side if pPage is the first
4890 ** or last child of its parent. If pPage has fewer than 2*NN siblings
4891 ** (something which can only happen if pPage is the root page or a
4892 ** child of root) then all available siblings participate in the balancing.
4894 ** The number of siblings of pPage might be increased or decreased by one or
4895 ** two in an effort to keep pages nearly full but not over full. The root page
4896 ** is special and is allowed to be nearly empty. If pPage is
4897 ** the root page, then the depth of the tree might be increased
4898 ** or decreased by one, as necessary, to keep the root page from being
4899 ** overfull or completely empty.
4901 ** Note that when this routine is called, some of the Cells on pPage
4902 ** might not actually be stored in pPage->aData[]. This can happen
4903 ** if the page is overfull. Part of the job of this routine is to
4904 ** make sure all Cells for pPage once again fit in pPage->aData[].
4906 ** In the course of balancing the siblings of pPage, the parent of pPage
4907 ** might become overfull or underfull. If that happens, then this routine
4908 ** is called recursively on the parent.
4910 ** If this routine fails for any reason, it might leave the database
4911 ** in a corrupted state. So if this routine fails, the database should
4914 static int balance_nonroot(BtCursor *pCur){
4915 MemPage *pPage; /* The over or underfull page to balance */
4916 MemPage *pParent; /* The parent of pPage */
4917 BtShared *pBt; /* The whole database */
4918 int nCell = 0; /* Number of cells in apCell[] */
4919 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */
4920 int nOld; /* Number of pages in apOld[] */
4921 int nNew; /* Number of pages in apNew[] */
4922 int nDiv; /* Number of cells in apDiv[] */
4923 int i, j, k; /* Loop counters */
4924 int idx; /* Index of pPage in pParent->aCell[] */
4925 int nxDiv; /* Next divider slot in pParent->aCell[] */
4926 int rc; /* The return code */
4927 int leafCorrection; /* 4 if pPage is a leaf. 0 if not */
4928 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */
4929 int usableSpace; /* Bytes in pPage beyond the header */
4930 int pageFlags; /* Value of pPage->aData[0] */
4931 int subtotal; /* Subtotal of bytes in cells on one page */
4932 int iSpace1 = 0; /* First unused byte of aSpace1[] */
4933 int iSpace2 = 0; /* First unused byte of aSpace2[] */
4934 int szScratch; /* Size of scratch memory requested */
4935 MemPage *apOld[NB]; /* pPage and up to two siblings */
4936 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */
4937 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */
4938 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */
4939 Pgno pgnoNew[NB+2]; /* Page numbers for each page in apNew[] */
4940 u8 *apDiv[NB]; /* Divider cells in pParent */
4941 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */
4942 int szNew[NB+2]; /* Combined size of cells place on i-th page */
4943 u8 **apCell = 0; /* All cells begin balanced */
4944 u16 *szCell; /* Local size of all cells in apCell[] */
4945 u8 *aCopy[NB]; /* Space for holding data of apCopy[] */
4946 u8 *aSpace1; /* Space for copies of dividers cells before balance */
4947 u8 *aSpace2 = 0; /* Space for overflow dividers cells after balance */
4950 pPage = pCur->apPage[pCur->iPage];
4951 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4952 VVA_ONLY( pCur->pagesShuffled = 1 );
4955 ** Find the parent page.
4957 assert( pCur->iPage>0 );
4958 assert( pPage->isInit );
4959 assert( sqlite3PagerIswriteable(pPage->pDbPage) || pPage->nOverflow==1 );
4961 pParent = pCur->apPage[pCur->iPage-1];
4963 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pParent->pDbPage)) ){
4967 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
4969 #ifndef SQLITE_OMIT_QUICKBALANCE
4971 ** A special case: If a new entry has just been inserted into a
4972 ** table (that is, a btree with integer keys and all data at the leaves)
4973 ** and the new entry is the right-most entry in the tree (it has the
4974 ** largest key) then use the special balance_quick() routine for
4975 ** balancing. balance_quick() is much faster and results in a tighter
4976 ** packing of data in the common case.
4980 pPage->nOverflow==1 &&
4981 pPage->aOvfl[0].idx==pPage->nCell &&
4983 get4byte(&pParent->aData[pParent->hdrOffset+8])==pPage->pgno
4985 assert( pPage->intKey );
4987 ** TODO: Check the siblings to the left of pPage. It may be that
4988 ** they are not full and no new page is required.
4990 return balance_quick(pCur);
4994 if( SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) ){
4999 ** Find the cell in the parent page whose left child points back
5000 ** to pPage. The "idx" variable is the index of that cell. If pPage
5001 ** is the rightmost child of pParent then set idx to pParent->nCell
5003 idx = pCur->aiIdx[pCur->iPage-1];
5004 assertParentIndex(pParent, idx, pPage->pgno);
5007 ** Initialize variables so that it will be safe to jump
5008 ** directly to balance_cleanup at any moment.
5013 ** Find sibling pages to pPage and the cells in pParent that divide
5014 ** the siblings. An attempt is made to find NN siblings on either
5015 ** side of pPage. More siblings are taken from one side, however, if
5016 ** pPage there are fewer than NN siblings on the other side. If pParent
5017 ** has NB or fewer children then all children of pParent are taken.
5020 if( nxDiv + NB > pParent->nCell ){
5021 nxDiv = pParent->nCell - NB + 1;
5027 for(i=0, k=nxDiv; i<NB; i++, k++){
5028 if( k<pParent->nCell ){
5029 apDiv[i] = findCell(pParent, k);
5031 assert( !pParent->leaf );
5032 pgnoOld[i] = get4byte(apDiv[i]);
5033 }else if( k==pParent->nCell ){
5034 pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]);
5038 rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]);
5039 if( rc ) goto balance_cleanup;
5040 /* apOld[i]->idxParent = k; */
5044 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
5047 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
5049 nMaxCells = (nMaxCells + 3)&~3;
5052 ** Allocate space for memory structures
5055 nMaxCells*sizeof(u8*) /* apCell */
5056 + nMaxCells*sizeof(u16) /* szCell */
5057 + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB /* aCopy */
5058 + pBt->pageSize /* aSpace1 */
5059 + (ISAUTOVACUUM ? nMaxCells : 0); /* aFrom */
5060 apCell = sqlite3ScratchMalloc( szScratch );
5063 goto balance_cleanup;
5065 szCell = (u16*)&apCell[nMaxCells];
5066 aCopy[0] = (u8*)&szCell[nMaxCells];
5067 assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5068 for(i=1; i<NB; i++){
5069 aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5070 assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5072 aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
5073 assert( ((aSpace1 - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
5075 aFrom = &aSpace1[pBt->pageSize];
5077 aSpace2 = sqlite3PageMalloc(pBt->pageSize);
5080 goto balance_cleanup;
5084 ** Make copies of the content of pPage and its siblings into aOld[].
5085 ** The rest of this function will use data from the copies rather
5086 ** that the original pages since the original pages will be in the
5087 ** process of being overwritten.
5089 for(i=0; i<nOld; i++){
5090 MemPage *p = apCopy[i] = (MemPage*)aCopy[i];
5091 memcpy(p, apOld[i], sizeof(MemPage));
5092 p->aData = (void*)&p[1];
5093 memcpy(p->aData, apOld[i]->aData, pBt->pageSize);
5097 ** Load pointers to all cells on sibling pages and the divider cells
5098 ** into the local apCell[] array. Make copies of the divider cells
5099 ** into space obtained form aSpace1[] and remove the the divider Cells
5102 ** If the siblings are on leaf pages, then the child pointers of the
5103 ** divider cells are stripped from the cells before they are copied
5104 ** into aSpace1[]. In this way, all cells in apCell[] are without
5105 ** child pointers. If siblings are not leaves, then all cell in
5106 ** apCell[] include child pointers. Either way, all cells in apCell[]
5109 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf.
5110 ** leafData: 1 if pPage holds key+data and pParent holds only keys.
5113 leafCorrection = pPage->leaf*4;
5114 leafData = pPage->hasData;
5115 for(i=0; i<nOld; i++){
5116 MemPage *pOld = apCopy[i];
5117 int limit = pOld->nCell+pOld->nOverflow;
5118 for(j=0; j<limit; j++){
5119 assert( nCell<nMaxCells );
5120 apCell[nCell] = findOverflowCell(pOld, j);
5121 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
5125 for(a=0; a<pOld->nOverflow; a++){
5126 if( pOld->aOvfl[a].pCell==apCell[nCell] ){
5127 aFrom[nCell] = 0xFF;
5135 u16 sz = cellSizePtr(pParent, apDiv[i]);
5137 /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
5138 ** are duplicates of keys on the child pages. We need to remove
5139 ** the divider cells from pParent, but the dividers cells are not
5140 ** added to apCell[] because they are duplicates of child cells.
5142 dropCell(pParent, nxDiv, sz);
5145 assert( nCell<nMaxCells );
5147 pTemp = &aSpace1[iSpace1];
5149 assert( sz<=pBt->pageSize/4 );
5150 assert( iSpace1<=pBt->pageSize );
5151 memcpy(pTemp, apDiv[i], sz);
5152 apCell[nCell] = pTemp+leafCorrection;
5154 aFrom[nCell] = 0xFF;
5156 dropCell(pParent, nxDiv, sz);
5157 szCell[nCell] -= leafCorrection;
5158 assert( get4byte(pTemp)==pgnoOld[i] );
5160 assert( leafCorrection==0 );
5161 /* The right pointer of the child page pOld becomes the left
5162 ** pointer of the divider cell */
5163 memcpy(apCell[nCell], &pOld->aData[pOld->hdrOffset+8], 4);
5165 assert( leafCorrection==4 );
5166 if( szCell[nCell]<4 ){
5167 /* Do not allow any cells smaller than 4 bytes. */
5177 ** Figure out the number of pages needed to hold all nCell cells.
5178 ** Store this number in "k". Also compute szNew[] which is the total
5179 ** size of all cells on the i-th page and cntNew[] which is the index
5180 ** in apCell[] of the cell that divides page i from page i+1.
5181 ** cntNew[k] should equal nCell.
5183 ** Values computed by this block:
5185 ** k: The total number of sibling pages
5186 ** szNew[i]: Spaced used on the i-th sibling page.
5187 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to
5188 ** the right of the i-th sibling page.
5189 ** usableSpace: Number of bytes of space available on each sibling.
5192 usableSpace = pBt->usableSize - 12 + leafCorrection;
5193 for(subtotal=k=i=0; i<nCell; i++){
5194 assert( i<nMaxCells );
5195 subtotal += szCell[i] + 2;
5196 if( subtotal > usableSpace ){
5197 szNew[k] = subtotal - szCell[i];
5199 if( leafData ){ i--; }
5204 szNew[k] = subtotal;
5209 ** The packing computed by the previous block is biased toward the siblings
5210 ** on the left side. The left siblings are always nearly full, while the
5211 ** right-most sibling might be nearly empty. This block of code attempts
5212 ** to adjust the packing of siblings to get a better balance.
5214 ** This adjustment is more than an optimization. The packing above might
5215 ** be so out of balance as to be illegal. For example, the right-most
5216 ** sibling might be completely empty. This adjustment is not optional.
5218 for(i=k-1; i>0; i--){
5219 int szRight = szNew[i]; /* Size of sibling on the right */
5220 int szLeft = szNew[i-1]; /* Size of sibling on the left */
5221 int r; /* Index of right-most cell in left sibling */
5222 int d; /* Index of first cell to the left of right sibling */
5224 r = cntNew[i-1] - 1;
5225 d = r + 1 - leafData;
5226 assert( d<nMaxCells );
5227 assert( r<nMaxCells );
5228 while( szRight==0 || szRight+szCell[d]+2<=szLeft-(szCell[r]+2) ){
5229 szRight += szCell[d] + 2;
5230 szLeft -= szCell[r] + 2;
5232 r = cntNew[i-1] - 1;
5233 d = r + 1 - leafData;
5236 szNew[i-1] = szLeft;
5239 /* Either we found one or more cells (cntnew[0])>0) or we are the
5240 ** a virtual root page. A virtual root page is when the real root
5241 ** page is page 1 and we are the only child of that page.
5243 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
5246 ** Allocate k new pages. Reuse old pages where possible.
5248 assert( pPage->pgno>1 );
5249 pageFlags = pPage->aData[0];
5253 pNew = apNew[i] = apOld[i];
5254 pgnoNew[i] = pgnoOld[i];
5256 rc = sqlite3PagerWrite(pNew->pDbPage);
5258 if( rc ) goto balance_cleanup;
5261 rc = allocateBtreePage(pBt, &pNew, &pgnoNew[i], pgnoNew[i-1], 0);
5262 if( rc ) goto balance_cleanup;
5268 /* Free any old pages that were not reused as new pages.
5271 rc = freePage(apOld[i]);
5272 if( rc ) goto balance_cleanup;
5273 releasePage(apOld[i]);
5279 ** Put the new pages in accending order. This helps to
5280 ** keep entries in the disk file in order so that a scan
5281 ** of the table is a linear scan through the file. That
5282 ** in turn helps the operating system to deliver pages
5283 ** from the disk more rapidly.
5285 ** An O(n^2) insertion sort algorithm is used, but since
5286 ** n is never more than NB (a small constant), that should
5287 ** not be a problem.
5289 ** When NB==3, this one optimization makes the database
5290 ** about 25% faster for large insertions and deletions.
5292 for(i=0; i<k-1; i++){
5293 int minV = pgnoNew[i];
5295 for(j=i+1; j<k; j++){
5296 if( pgnoNew[j]<(unsigned)minV ){
5306 pgnoNew[i] = pgnoNew[minI];
5307 apNew[i] = apNew[minI];
5312 TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",
5314 nOld>=2 ? pgnoOld[1] : 0,
5315 nOld>=3 ? pgnoOld[2] : 0,
5316 pgnoNew[0], szNew[0],
5317 nNew>=2 ? pgnoNew[1] : 0, nNew>=2 ? szNew[1] : 0,
5318 nNew>=3 ? pgnoNew[2] : 0, nNew>=3 ? szNew[2] : 0,
5319 nNew>=4 ? pgnoNew[3] : 0, nNew>=4 ? szNew[3] : 0,
5320 nNew>=5 ? pgnoNew[4] : 0, nNew>=5 ? szNew[4] : 0));
5323 ** Evenly distribute the data in apCell[] across the new pages.
5324 ** Insert divider cells into pParent as necessary.
5327 for(i=0; i<nNew; i++){
5328 /* Assemble the new sibling page. */
5329 MemPage *pNew = apNew[i];
5330 assert( j<nMaxCells );
5331 assert( pNew->pgno==pgnoNew[i] );
5332 zeroPage(pNew, pageFlags);
5333 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);
5334 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );
5335 assert( pNew->nOverflow==0 );
5337 /* If this is an auto-vacuum database, update the pointer map entries
5338 ** that point to the siblings that were rearranged. These can be: left
5339 ** children of cells, the right-child of the page, or overflow pages
5340 ** pointed to by cells.
5343 for(k=j; k<cntNew[i]; k++){
5344 assert( k<nMaxCells );
5345 if( aFrom[k]==0xFF || apCopy[aFrom[k]]->pgno!=pNew->pgno ){
5346 rc = ptrmapPutOvfl(pNew, k-j);
5347 if( rc==SQLITE_OK && leafCorrection==0 ){
5348 rc = ptrmapPut(pBt, get4byte(apCell[k]), PTRMAP_BTREE, pNew->pgno);
5350 if( rc!=SQLITE_OK ){
5351 goto balance_cleanup;
5359 /* If the sibling page assembled above was not the right-most sibling,
5360 ** insert a divider cell into the parent page.
5362 if( i<nNew-1 && j<nCell ){
5367 assert( j<nMaxCells );
5369 sz = szCell[j] + leafCorrection;
5370 pTemp = &aSpace2[iSpace2];
5372 memcpy(&pNew->aData[8], pCell, 4);
5374 && (aFrom[j]==0xFF || apCopy[aFrom[j]]->pgno!=pNew->pgno)
5376 rc = ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno);
5377 if( rc!=SQLITE_OK ){
5378 goto balance_cleanup;
5381 }else if( leafData ){
5382 /* If the tree is a leaf-data tree, and the siblings are leaves,
5383 ** then there is no divider cell in apCell[]. Instead, the divider
5384 ** cell consists of the integer key for the right-most cell of
5385 ** the sibling-page assembled above only.
5389 sqlite3BtreeParseCellPtr(pNew, apCell[j], &info);
5391 fillInCell(pParent, pCell, 0, info.nKey, 0, 0, 0, &sz);
5395 /* Obscure case for non-leaf-data trees: If the cell at pCell was
5396 ** previously stored on a leaf node, and its reported size was 4
5397 ** bytes, then it may actually be smaller than this
5398 ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of
5399 ** any cell). But it is important to pass the correct size to
5400 ** insertCell(), so reparse the cell now.
5402 ** Note that this can never happen in an SQLite data file, as all
5403 ** cells are at least 4 bytes. It only happens in b-trees used
5404 ** to evaluate "IN (SELECT ...)" and similar clauses.
5407 assert(leafCorrection==4);
5408 sz = cellSizePtr(pParent, pCell);
5412 assert( sz<=pBt->pageSize/4 );
5413 assert( iSpace2<=pBt->pageSize );
5414 rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, 4);
5415 if( rc!=SQLITE_OK ) goto balance_cleanup;
5416 put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
5418 /* If this is an auto-vacuum database, and not a leaf-data tree,
5419 ** then update the pointer map with an entry for the overflow page
5420 ** that the cell just inserted points to (if any).
5422 if( ISAUTOVACUUM && !leafData ){
5423 rc = ptrmapPutOvfl(pParent, nxDiv);
5424 if( rc!=SQLITE_OK ){
5425 goto balance_cleanup;
5432 /* Set the pointer-map entry for the new sibling page. */
5434 rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno);
5435 if( rc!=SQLITE_OK ){
5436 goto balance_cleanup;
5443 if( (pageFlags & PTF_LEAF)==0 ){
5444 u8 *zChild = &apCopy[nOld-1]->aData[8];
5445 memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
5447 rc = ptrmapPut(pBt, get4byte(zChild), PTRMAP_BTREE, apNew[nNew-1]->pgno);
5448 if( rc!=SQLITE_OK ){
5449 goto balance_cleanup;
5453 if( nxDiv==pParent->nCell+pParent->nOverflow ){
5454 /* Right-most sibling is the right-most child of pParent */
5455 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew[nNew-1]);
5457 /* Right-most sibling is the left child of the first entry in pParent
5458 ** past the right-most divider entry */
5459 put4byte(findOverflowCell(pParent, nxDiv), pgnoNew[nNew-1]);
5463 ** Balance the parent page. Note that the current page (pPage) might
5464 ** have been added to the freelist so it might no longer be initialized.
5465 ** But the parent page will always be initialized.
5467 assert( pParent->isInit );
5468 sqlite3ScratchFree(apCell);
5472 rc = balance(pCur, 0);
5475 ** Cleanup before returning.
5478 sqlite3PageFree(aSpace2);
5479 sqlite3ScratchFree(apCell);
5480 for(i=0; i<nOld; i++){
5481 releasePage(apOld[i]);
5483 for(i=0; i<nNew; i++){
5484 releasePage(apNew[i]);
5487 /* releasePage(pParent); */
5488 TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n",
5489 pPage->pgno, nOld, nNew, nCell));
5495 ** This routine is called for the root page of a btree when the root
5496 ** page contains no cells. This is an opportunity to make the tree
5497 ** shallower by one level.
5499 static int balance_shallower(BtCursor *pCur){
5500 MemPage *pPage; /* Root page of B-Tree */
5501 MemPage *pChild; /* The only child page of pPage */
5502 Pgno pgnoChild; /* Page number for pChild */
5503 int rc = SQLITE_OK; /* Return code from subprocedures */
5504 BtShared *pBt; /* The main BTree structure */
5505 int mxCellPerPage; /* Maximum number of cells per page */
5506 u8 **apCell; /* All cells from pages being balanced */
5507 u16 *szCell; /* Local size of all cells */
5509 assert( pCur->iPage==0 );
5510 pPage = pCur->apPage[0];
5512 assert( pPage->nCell==0 );
5513 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5515 mxCellPerPage = MX_CELL(pBt);
5516 apCell = sqlite3Malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
5517 if( apCell==0 ) return SQLITE_NOMEM;
5518 szCell = (u16*)&apCell[mxCellPerPage];
5520 /* The table is completely empty */
5521 TRACE(("BALANCE: empty table %d\n", pPage->pgno));
5523 /* The root page is empty but has one child. Transfer the
5524 ** information from that one child into the root page if it
5525 ** will fit. This reduces the depth of the tree by one.
5527 ** If the root page is page 1, it has less space available than
5528 ** its child (due to the 100 byte header that occurs at the beginning
5529 ** of the database fle), so it might not be able to hold all of the
5530 ** information currently contained in the child. If this is the
5531 ** case, then do not do the transfer. Leave page 1 empty except
5532 ** for the right-pointer to the child page. The child page becomes
5533 ** the virtual root of the tree.
5535 VVA_ONLY( pCur->pagesShuffled = 1 );
5536 pgnoChild = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5537 assert( pgnoChild>0 );
5538 assert( pgnoChild<=pagerPagecount(pPage->pBt->pPager) );
5539 rc = sqlite3BtreeGetPage(pPage->pBt, pgnoChild, &pChild, 0);
5540 if( rc ) goto end_shallow_balance;
5541 if( pPage->pgno==1 ){
5542 rc = sqlite3BtreeInitPage(pChild);
5543 if( rc ) goto end_shallow_balance;
5544 assert( pChild->nOverflow==0 );
5545 if( pChild->nFree>=100 ){
5546 /* The child information will fit on the root page, so do the
5549 zeroPage(pPage, pChild->aData[0]);
5550 for(i=0; i<pChild->nCell; i++){
5551 apCell[i] = findCell(pChild,i);
5552 szCell[i] = cellSizePtr(pChild, apCell[i]);
5554 assemblePage(pPage, pChild->nCell, apCell, szCell);
5555 /* Copy the right-pointer of the child to the parent. */
5556 put4byte(&pPage->aData[pPage->hdrOffset+8],
5557 get4byte(&pChild->aData[pChild->hdrOffset+8]));
5559 TRACE(("BALANCE: child %d transfer to page 1\n", pChild->pgno));
5561 /* The child has more information that will fit on the root.
5562 ** The tree is already balanced. Do nothing. */
5563 TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno));
5566 memcpy(pPage->aData, pChild->aData, pPage->pBt->usableSize);
5568 rc = sqlite3BtreeInitPage(pPage);
5569 assert( rc==SQLITE_OK );
5571 TRACE(("BALANCE: transfer child %d into root %d\n",
5572 pChild->pgno, pPage->pgno));
5574 assert( pPage->nOverflow==0 );
5576 rc = setChildPtrmaps(pPage);
5578 releasePage(pChild);
5580 end_shallow_balance:
5581 sqlite3_free(apCell);
5587 ** The root page is overfull
5589 ** When this happens, Create a new child page and copy the
5590 ** contents of the root into the child. Then make the root
5591 ** page an empty page with rightChild pointing to the new
5592 ** child. Finally, call balance_internal() on the new child
5593 ** to cause it to split.
5595 static int balance_deeper(BtCursor *pCur){
5596 int rc; /* Return value from subprocedures */
5597 MemPage *pPage; /* Pointer to the root page */
5598 MemPage *pChild; /* Pointer to a new child page */
5599 Pgno pgnoChild; /* Page number of the new child page */
5600 BtShared *pBt; /* The BTree */
5601 int usableSize; /* Total usable size of a page */
5602 u8 *data; /* Content of the parent page */
5603 u8 *cdata; /* Content of the child page */
5604 int hdr; /* Offset to page header in parent */
5605 int cbrk; /* Offset to content of first cell in parent */
5607 assert( pCur->iPage==0 );
5608 assert( pCur->apPage[0]->nOverflow>0 );
5610 VVA_ONLY( pCur->pagesShuffled = 1 );
5611 pPage = pCur->apPage[0];
5613 assert( sqlite3_mutex_held(pBt->mutex) );
5614 rc = allocateBtreePage(pBt, &pChild, &pgnoChild, pPage->pgno, 0);
5616 assert( sqlite3PagerIswriteable(pChild->pDbPage) );
5617 usableSize = pBt->usableSize;
5618 data = pPage->aData;
5619 hdr = pPage->hdrOffset;
5620 cbrk = get2byte(&data[hdr+5]);
5621 cdata = pChild->aData;
5622 memcpy(cdata, &data[hdr], pPage->cellOffset+2*pPage->nCell-hdr);
5623 memcpy(&cdata[cbrk], &data[cbrk], usableSize-cbrk);
5625 rc = sqlite3BtreeInitPage(pChild);
5626 if( rc==SQLITE_OK ){
5627 int nCopy = pPage->nOverflow*sizeof(pPage->aOvfl[0]);
5628 memcpy(pChild->aOvfl, pPage->aOvfl, nCopy);
5629 pChild->nOverflow = pPage->nOverflow;
5630 if( pChild->nOverflow ){
5633 assert( pChild->nCell==pPage->nCell );
5634 zeroPage(pPage, pChild->aData[0] & ~PTF_LEAF);
5635 put4byte(&pPage->aData[pPage->hdrOffset+8], pgnoChild);
5636 TRACE(("BALANCE: copy root %d into %d\n", pPage->pgno, pChild->pgno));
5638 rc = ptrmapPut(pBt, pChild->pgno, PTRMAP_BTREE, pPage->pgno);
5639 if( rc==SQLITE_OK ){
5640 rc = setChildPtrmaps(pChild);
5645 if( rc==SQLITE_OK ){
5647 pCur->apPage[1] = pChild;
5649 rc = balance_nonroot(pCur);
5651 releasePage(pChild);
5658 ** The page that pCur currently points to has just been modified in
5659 ** some way. This function figures out if this modification means the
5660 ** tree needs to be balanced, and if so calls the appropriate balancing
5663 ** Parameter isInsert is true if a new cell was just inserted into the
5664 ** page, or false otherwise.
5666 static int balance(BtCursor *pCur, int isInsert){
5668 MemPage *pPage = pCur->apPage[pCur->iPage];
5670 assert( sqlite3_mutex_held(pPage->pBt->mutex) );
5671 if( pCur->iPage==0 ){
5672 rc = sqlite3PagerWrite(pPage->pDbPage);
5673 if( rc==SQLITE_OK && pPage->nOverflow>0 ){
5674 rc = balance_deeper(pCur);
5676 if( rc==SQLITE_OK && pPage->nCell==0 ){
5677 rc = balance_shallower(pCur);
5680 if( pPage->nOverflow>0 ||
5681 (!isInsert && pPage->nFree>pPage->pBt->usableSize*2/3) ){
5682 rc = balance_nonroot(pCur);
5689 ** This routine checks all cursors that point to table pgnoRoot.
5690 ** If any of those cursors were opened with wrFlag==0 in a different
5691 ** database connection (a database connection that shares the pager
5692 ** cache with the current connection) and that other connection
5693 ** is not in the ReadUncommmitted state, then this routine returns
5696 ** As well as cursors with wrFlag==0, cursors with wrFlag==1 and
5697 ** isIncrblobHandle==1 are also considered 'read' cursors. Incremental
5698 ** blob cursors are used for both reading and writing.
5700 ** When pgnoRoot is the root page of an intkey table, this function is also
5701 ** responsible for invalidating incremental blob cursors when the table row
5702 ** on which they are opened is deleted or modified. Cursors are invalidated
5703 ** according to the following rules:
5705 ** 1) When BtreeClearTable() is called to completely delete the contents
5706 ** of a B-Tree table, pExclude is set to zero and parameter iRow is
5707 ** set to non-zero. In this case all incremental blob cursors open
5708 ** on the table rooted at pgnoRoot are invalidated.
5710 ** 2) When BtreeInsert(), BtreeDelete() or BtreePutData() is called to
5711 ** modify a table row via an SQL statement, pExclude is set to the
5712 ** write cursor used to do the modification and parameter iRow is set
5713 ** to the integer row id of the B-Tree entry being modified. Unless
5714 ** pExclude is itself an incremental blob cursor, then all incremental
5715 ** blob cursors open on row iRow of the B-Tree are invalidated.
5717 ** 3) If both pExclude and iRow are set to zero, no incremental blob
5718 ** cursors are invalidated.
5720 static int checkReadLocks(
5727 BtShared *pBt = pBtree->pBt;
5728 sqlite3 *db = pBtree->db;
5729 assert( sqlite3BtreeHoldsMutex(pBtree) );
5730 for(p=pBt->pCursor; p; p=p->pNext){
5731 if( p==pExclude ) continue;
5732 if( p->pgnoRoot!=pgnoRoot ) continue;
5733 #ifndef SQLITE_OMIT_INCRBLOB
5734 if( p->isIncrblobHandle && (
5736 || (pExclude && !pExclude->isIncrblobHandle && p->info.nKey==iRow)
5738 p->eState = CURSOR_INVALID;
5741 if( p->eState!=CURSOR_VALID ) continue;
5743 #ifndef SQLITE_OMIT_INCRBLOB
5744 || p->isIncrblobHandle
5747 sqlite3 *dbOther = p->pBtree->db;
5749 (dbOther!=db && (dbOther->flags & SQLITE_ReadUncommitted)==0) ){
5750 return SQLITE_LOCKED;
5758 ** Insert a new record into the BTree. The key is given by (pKey,nKey)
5759 ** and the data is given by (pData,nData). The cursor is used only to
5760 ** define what table the record should be inserted into. The cursor
5761 ** is left pointing at a random location.
5763 ** For an INTKEY table, only the nKey value of the key is used. pKey is
5764 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
5766 int sqlite3BtreeInsert(
5767 BtCursor *pCur, /* Insert data into the table of this cursor */
5768 const void *pKey, i64 nKey, /* The key of the new record */
5769 const void *pData, int nData, /* The data of the new record */
5770 int nZero, /* Number of extra 0 bytes to append to data */
5771 int appendBias /* True if this is likely an append */
5778 Btree *p = pCur->pBtree;
5779 BtShared *pBt = p->pBt;
5780 unsigned char *oldCell;
5781 unsigned char *newCell = 0;
5783 assert( cursorHoldsMutex(pCur) );
5784 if( pBt->inTransaction!=TRANS_WRITE ){
5785 /* Must start a transaction before doing an insert */
5786 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5789 assert( !pBt->readOnly );
5790 if( !pCur->wrFlag ){
5791 return SQLITE_PERM; /* Cursor not open for writing */
5793 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, nKey) ){
5794 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5796 if( pCur->eState==CURSOR_FAULT ){
5800 /* Save the positions of any other cursors open on this table */
5801 sqlite3BtreeClearCursor(pCur);
5803 SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) ||
5804 SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc))
5809 pPage = pCur->apPage[pCur->iPage];
5810 assert( pPage->intKey || nKey>=0 );
5811 assert( pPage->leaf || !pPage->intKey );
5812 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
5813 pCur->pgnoRoot, nKey, nData, pPage->pgno,
5814 loc==0 ? "overwrite" : "new entry"));
5815 assert( pPage->isInit );
5816 allocateTempSpace(pBt);
5817 newCell = pBt->pTmpSpace;
5818 if( newCell==0 ) return SQLITE_NOMEM;
5819 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
5820 if( rc ) goto end_insert;
5821 assert( szNew==cellSizePtr(pPage, newCell) );
5822 assert( szNew<=MX_CELL_SIZE(pBt) );
5823 idx = pCur->aiIdx[pCur->iPage];
5824 if( loc==0 && CURSOR_VALID==pCur->eState ){
5826 assert( idx<pPage->nCell );
5827 rc = sqlite3PagerWrite(pPage->pDbPage);
5831 oldCell = findCell(pPage, idx);
5833 memcpy(newCell, oldCell, 4);
5835 szOld = cellSizePtr(pPage, oldCell);
5836 rc = clearCell(pPage, oldCell);
5837 if( rc ) goto end_insert;
5838 rc = dropCell(pPage, idx, szOld);
5839 if( rc ) goto end_insert;
5840 }else if( loc<0 && pPage->nCell>0 ){
5841 assert( pPage->leaf );
5842 idx = ++pCur->aiIdx[pCur->iPage];
5843 pCur->info.nSize = 0;
5844 pCur->validNKey = 0;
5846 assert( pPage->leaf );
5848 rc = insertCell(pPage, idx, newCell, szNew, 0, 0);
5849 if( rc!=SQLITE_OK ) goto end_insert;
5850 rc = balance(pCur, 1);
5851 if( rc==SQLITE_OK ){
5859 ** Delete the entry that the cursor is pointing to. The cursor
5860 ** is left pointing at a arbitrary location.
5862 int sqlite3BtreeDelete(BtCursor *pCur){
5863 MemPage *pPage = pCur->apPage[pCur->iPage];
5865 unsigned char *pCell;
5868 Btree *p = pCur->pBtree;
5869 BtShared *pBt = p->pBt;
5871 assert( cursorHoldsMutex(pCur) );
5872 assert( pPage->isInit );
5873 if( pBt->inTransaction!=TRANS_WRITE ){
5874 /* Must start a transaction before doing a delete */
5875 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
5878 assert( !pBt->readOnly );
5879 if( pCur->eState==CURSOR_FAULT ){
5882 if( pCur->aiIdx[pCur->iPage]>=pPage->nCell ){
5883 return SQLITE_ERROR; /* The cursor is not pointing to anything */
5885 if( !pCur->wrFlag ){
5886 return SQLITE_PERM; /* Did not open this cursor for writing */
5888 if( checkReadLocks(pCur->pBtree, pCur->pgnoRoot, pCur, pCur->info.nKey) ){
5889 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
5892 /* Restore the current cursor position (a no-op if the cursor is not in
5893 ** CURSOR_REQUIRESEEK state) and save the positions of any other cursors
5894 ** open on the same table. Then call sqlite3PagerWrite() on the page
5895 ** that the entry will be deleted from.
5898 (rc = restoreCursorPosition(pCur))!=0 ||
5899 (rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur))!=0 ||
5900 (rc = sqlite3PagerWrite(pPage->pDbPage))!=0
5905 /* Locate the cell within its page and leave pCell pointing to the
5906 ** data. The clearCell() call frees any overflow pages associated with the
5907 ** cell. The cell itself is still intact.
5909 idx = pCur->aiIdx[pCur->iPage];
5910 pCell = findCell(pPage, idx);
5912 pgnoChild = get4byte(pCell);
5914 rc = clearCell(pPage, pCell);
5921 ** The entry we are about to delete is not a leaf so if we do not
5922 ** do something we will leave a hole on an internal page.
5923 ** We have to fill the hole by moving in a cell from a leaf. The
5924 ** next Cell after the one to be deleted is guaranteed to exist and
5925 ** to be a leaf so we can use it.
5930 unsigned char *pNext;
5932 unsigned char *tempCell = 0;
5933 assert( !pPage->intKey );
5934 sqlite3BtreeGetTempCursor(pCur, &leafCur);
5935 rc = sqlite3BtreeNext(&leafCur, ¬Used);
5936 if( rc==SQLITE_OK ){
5937 assert( leafCur.aiIdx[leafCur.iPage]==0 );
5938 pLeafPage = leafCur.apPage[leafCur.iPage];
5939 rc = sqlite3PagerWrite(pLeafPage->pDbPage);
5941 if( rc==SQLITE_OK ){
5942 int leafCursorInvalid = 0;
5944 TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
5945 pCur->pgnoRoot, pPage->pgno, pLeafPage->pgno));
5946 dropCell(pPage, idx, cellSizePtr(pPage, pCell));
5947 pNext = findCell(pLeafPage, 0);
5948 szNext = cellSizePtr(pLeafPage, pNext);
5949 assert( MX_CELL_SIZE(pBt)>=szNext+4 );
5950 allocateTempSpace(pBt);
5951 tempCell = pBt->pTmpSpace;
5955 if( rc==SQLITE_OK ){
5956 rc = insertCell(pPage, idx, pNext-4, szNext+4, tempCell, 0);
5960 /* The "if" statement in the next code block is critical. The
5961 ** slightest error in that statement would allow SQLite to operate
5962 ** correctly most of the time but produce very rare failures. To
5963 ** guard against this, the following macros help to verify that
5964 ** the "if" statement is well tested.
5966 testcase( pPage->nOverflow==0 && pPage->nFree<pBt->usableSize*2/3
5967 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5968 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3
5969 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5970 testcase( pPage->nOverflow==0 && pPage->nFree==pBt->usableSize*2/3+1
5971 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5972 testcase( pPage->nOverflow>0 && pPage->nFree<=pBt->usableSize*2/3
5973 && pLeafPage->nFree+2+szNext > pBt->usableSize*2/3 );
5974 testcase( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3))
5975 && pLeafPage->nFree+2+szNext == pBt->usableSize*2/3 );
5978 if( (pPage->nOverflow>0 || (pPage->nFree > pBt->usableSize*2/3)) &&
5979 (pLeafPage->nFree+2+szNext > pBt->usableSize*2/3)
5981 /* This branch is taken if the internal node is now either overflowing
5982 ** or underfull and the leaf node will be underfull after the just cell
5983 ** copied to the internal node is deleted from it. This is a special
5984 ** case because the call to balance() to correct the internal node
5985 ** may change the tree structure and invalidate the contents of
5986 ** the leafCur.apPage[] and leafCur.aiIdx[] arrays, which will be
5987 ** used by the balance() required to correct the underfull leaf
5990 ** The formula used in the expression above are based on facets of
5991 ** the SQLite file-format that do not change over time.
5993 testcase( pPage->nFree==pBt->usableSize*2/3+1 );
5994 testcase( pLeafPage->nFree+2+szNext==pBt->usableSize*2/3+1 );
5995 leafCursorInvalid = 1;
5998 if( rc==SQLITE_OK ){
5999 put4byte(findOverflowCell(pPage, idx), pgnoChild);
6000 VVA_ONLY( pCur->pagesShuffled = 0 );
6001 rc = balance(pCur, 0);
6004 if( rc==SQLITE_OK && leafCursorInvalid ){
6005 /* The leaf-node is now underfull and so the tree needs to be
6006 ** rebalanced. However, the balance() operation on the internal
6007 ** node above may have modified the structure of the B-Tree and
6008 ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[]
6009 ** may not be trusted.
6011 ** It is not possible to copy the ancestry from pCur, as the same
6012 ** balance() call has invalidated the pCur->apPage[] and aiIdx[]
6015 ** The call to saveCursorPosition() below internally saves the
6016 ** key that leafCur is currently pointing to. Currently, there
6017 ** are two copies of that key in the tree - one here on the leaf
6018 ** page and one on some internal node in the tree. The copy on
6019 ** the leaf node is always the next key in tree-order after the
6020 ** copy on the internal node. So, the call to sqlite3BtreeNext()
6021 ** calls restoreCursorPosition() to point the cursor to the copy
6022 ** stored on the internal node, then advances to the next entry,
6023 ** which happens to be the copy of the key on the internal node.
6024 ** Net effect: leafCur is pointing back to the duplicate cell
6025 ** that needs to be removed, and the leafCur.apPage[] and
6026 ** leafCur.aiIdx[] arrays are correct.
6028 VVA_ONLY( Pgno leafPgno = pLeafPage->pgno );
6029 rc = saveCursorPosition(&leafCur);
6030 if( rc==SQLITE_OK ){
6031 rc = sqlite3BtreeNext(&leafCur, ¬Used);
6033 pLeafPage = leafCur.apPage[leafCur.iPage];
6034 assert( pLeafPage->pgno==leafPgno );
6035 assert( leafCur.aiIdx[leafCur.iPage]==0 );
6038 if( rc==SQLITE_OK ){
6039 dropCell(pLeafPage, 0, szNext);
6040 VVA_ONLY( leafCur.pagesShuffled = 0 );
6041 rc = balance(&leafCur, 0);
6042 assert( leafCursorInvalid || !leafCur.pagesShuffled
6043 || !pCur->pagesShuffled );
6046 sqlite3BtreeReleaseTempCursor(&leafCur);
6048 TRACE(("DELETE: table=%d delete from leaf %d\n",
6049 pCur->pgnoRoot, pPage->pgno));
6050 rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell));
6051 if( rc==SQLITE_OK ){
6052 rc = balance(pCur, 0);
6055 if( rc==SQLITE_OK ){
6062 ** Create a new BTree table. Write into *piTable the page
6063 ** number for the root page of the new table.
6065 ** The type of type is determined by the flags parameter. Only the
6066 ** following values of flags are currently in use. Other values for
6067 ** flags might not work:
6069 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys
6070 ** BTREE_ZERODATA Used for SQL indices
6072 static int btreeCreateTable(Btree *p, int *piTable, int flags){
6073 BtShared *pBt = p->pBt;
6078 assert( sqlite3BtreeHoldsMutex(p) );
6079 if( pBt->inTransaction!=TRANS_WRITE ){
6080 /* Must start a transaction first */
6081 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6084 assert( !pBt->readOnly );
6086 #ifdef SQLITE_OMIT_AUTOVACUUM
6087 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6092 if( pBt->autoVacuum ){
6093 Pgno pgnoMove; /* Move a page here to make room for the root-page */
6094 MemPage *pPageMove; /* The page to move to. */
6096 /* Creating a new table may probably require moving an existing database
6097 ** to make room for the new tables root page. In case this page turns
6098 ** out to be an overflow page, delete all overflow page-map caches
6099 ** held by open cursors.
6101 invalidateAllOverflowCache(pBt);
6103 /* Read the value of meta[3] from the database to determine where the
6104 ** root page of the new table should go. meta[3] is the largest root-page
6105 ** created so far, so the new root-page is (meta[3]+1).
6107 rc = sqlite3BtreeGetMeta(p, 4, &pgnoRoot);
6108 if( rc!=SQLITE_OK ){
6113 /* The new root-page may not be allocated on a pointer-map page, or the
6114 ** PENDING_BYTE page.
6116 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
6117 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
6120 assert( pgnoRoot>=3 );
6122 /* Allocate a page. The page that currently resides at pgnoRoot will
6123 ** be moved to the allocated page (unless the allocated page happens
6124 ** to reside at pgnoRoot).
6126 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, 1);
6127 if( rc!=SQLITE_OK ){
6131 if( pgnoMove!=pgnoRoot ){
6132 /* pgnoRoot is the page that will be used for the root-page of
6133 ** the new table (assuming an error did not occur). But we were
6134 ** allocated pgnoMove. If required (i.e. if it was not allocated
6135 ** by extending the file), the current page at position pgnoMove
6136 ** is already journaled.
6141 releasePage(pPageMove);
6143 /* Move the page currently at pgnoRoot to pgnoMove. */
6144 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6145 if( rc!=SQLITE_OK ){
6148 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
6149 if( rc!=SQLITE_OK || eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
6153 assert( eType!=PTRMAP_ROOTPAGE );
6154 assert( eType!=PTRMAP_FREEPAGE );
6155 rc = sqlite3PagerWrite(pRoot->pDbPage);
6156 if( rc!=SQLITE_OK ){
6160 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
6163 /* Obtain the page at pgnoRoot */
6164 if( rc!=SQLITE_OK ){
6167 rc = sqlite3BtreeGetPage(pBt, pgnoRoot, &pRoot, 0);
6168 if( rc!=SQLITE_OK ){
6171 rc = sqlite3PagerWrite(pRoot->pDbPage);
6172 if( rc!=SQLITE_OK ){
6180 /* Update the pointer-map and meta-data with the new root-page number. */
6181 rc = ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0);
6186 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
6193 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
6197 assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
6198 zeroPage(pRoot, flags | PTF_LEAF);
6199 sqlite3PagerUnref(pRoot->pDbPage);
6200 *piTable = (int)pgnoRoot;
6203 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
6205 sqlite3BtreeEnter(p);
6207 rc = btreeCreateTable(p, piTable, flags);
6208 sqlite3BtreeLeave(p);
6213 ** Erase the given database page and all its children. Return
6214 ** the page to the freelist.
6216 static int clearDatabasePage(
6217 BtShared *pBt, /* The BTree that contains the table */
6218 Pgno pgno, /* Page number to clear */
6219 MemPage *pParent, /* Parent page. NULL for the root */
6220 int freePageFlag /* Deallocate page if true */
6224 unsigned char *pCell;
6227 assert( sqlite3_mutex_held(pBt->mutex) );
6228 if( pgno>pagerPagecount(pBt->pPager) ){
6229 return SQLITE_CORRUPT_BKPT;
6232 rc = getAndInitPage(pBt, pgno, &pPage);
6233 if( rc ) goto cleardatabasepage_out;
6234 for(i=0; i<pPage->nCell; i++){
6235 pCell = findCell(pPage, i);
6237 rc = clearDatabasePage(pBt, get4byte(pCell), pPage, 1);
6238 if( rc ) goto cleardatabasepage_out;
6240 rc = clearCell(pPage, pCell);
6241 if( rc ) goto cleardatabasepage_out;
6244 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[8]), pPage, 1);
6245 if( rc ) goto cleardatabasepage_out;
6248 rc = freePage(pPage);
6249 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
6250 zeroPage(pPage, pPage->aData[0] | PTF_LEAF);
6253 cleardatabasepage_out:
6259 ** Delete all information from a single table in the database. iTable is
6260 ** the page number of the root of the table. After this routine returns,
6261 ** the root page is empty, but still exists.
6263 ** This routine will fail with SQLITE_LOCKED if there are any open
6264 ** read cursors on the table. Open write cursors are moved to the
6265 ** root of the table.
6267 int sqlite3BtreeClearTable(Btree *p, int iTable){
6269 BtShared *pBt = p->pBt;
6270 sqlite3BtreeEnter(p);
6272 if( p->inTrans!=TRANS_WRITE ){
6273 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6274 }else if( (rc = checkReadLocks(p, iTable, 0, 1))!=SQLITE_OK ){
6276 }else if( SQLITE_OK!=(rc = saveAllCursors(pBt, iTable, 0)) ){
6279 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, 0);
6281 sqlite3BtreeLeave(p);
6286 ** Erase all information in a table and add the root of the table to
6287 ** the freelist. Except, the root of the principle table (the one on
6288 ** page 1) is never added to the freelist.
6290 ** This routine will fail with SQLITE_LOCKED if there are any open
6291 ** cursors on the table.
6293 ** If AUTOVACUUM is enabled and the page at iTable is not the last
6294 ** root page in the database file, then the last root page
6295 ** in the database file is moved into the slot formerly occupied by
6296 ** iTable and that last slot formerly occupied by the last root page
6297 ** is added to the freelist instead of iTable. In this say, all
6298 ** root pages are kept at the beginning of the database file, which
6299 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the
6300 ** page number that used to be the last root page in the file before
6301 ** the move. If no page gets moved, *piMoved is set to 0.
6302 ** The last root page is recorded in meta[3] and the value of
6303 ** meta[3] is updated by this procedure.
6305 static int btreeDropTable(Btree *p, int iTable, int *piMoved){
6308 BtShared *pBt = p->pBt;
6310 assert( sqlite3BtreeHoldsMutex(p) );
6311 if( p->inTrans!=TRANS_WRITE ){
6312 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6315 /* It is illegal to drop a table if any cursors are open on the
6316 ** database. This is because in auto-vacuum mode the backend may
6317 ** need to move another root-page to fill a gap left by the deleted
6318 ** root page. If an open cursor was using this page a problem would
6322 return SQLITE_LOCKED;
6325 rc = sqlite3BtreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
6327 rc = sqlite3BtreeClearTable(p, iTable);
6336 #ifdef SQLITE_OMIT_AUTOVACUUM
6337 rc = freePage(pPage);
6340 if( pBt->autoVacuum ){
6342 rc = sqlite3BtreeGetMeta(p, 4, &maxRootPgno);
6343 if( rc!=SQLITE_OK ){
6348 if( iTable==maxRootPgno ){
6349 /* If the table being dropped is the table with the largest root-page
6350 ** number in the database, put the root page on the free list.
6352 rc = freePage(pPage);
6354 if( rc!=SQLITE_OK ){
6358 /* The table being dropped does not have the largest root-page
6359 ** number in the database. So move the page that does into the
6360 ** gap left by the deleted root-page.
6364 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6365 if( rc!=SQLITE_OK ){
6368 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
6370 if( rc!=SQLITE_OK ){
6373 rc = sqlite3BtreeGetPage(pBt, maxRootPgno, &pMove, 0);
6374 if( rc!=SQLITE_OK ){
6377 rc = freePage(pMove);
6379 if( rc!=SQLITE_OK ){
6382 *piMoved = maxRootPgno;
6385 /* Set the new 'max-root-page' value in the database header. This
6386 ** is the old value less one, less one more if that happens to
6387 ** be a root-page number, less one again if that is the
6388 ** PENDING_BYTE_PAGE.
6391 if( maxRootPgno==PENDING_BYTE_PAGE(pBt) ){
6394 if( maxRootPgno==PTRMAP_PAGENO(pBt, maxRootPgno) ){
6397 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
6399 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
6401 rc = freePage(pPage);
6406 /* If sqlite3BtreeDropTable was called on page 1. */
6407 zeroPage(pPage, PTF_INTKEY|PTF_LEAF );
6412 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
6414 sqlite3BtreeEnter(p);
6416 rc = btreeDropTable(p, iTable, piMoved);
6417 sqlite3BtreeLeave(p);
6423 ** Read the meta-information out of a database file. Meta[0]
6424 ** is the number of free pages currently in the database. Meta[1]
6425 ** through meta[15] are available for use by higher layers. Meta[0]
6426 ** is read-only, the others are read/write.
6428 ** The schema layer numbers meta values differently. At the schema
6429 ** layer (and the SetCookie and ReadCookie opcodes) the number of
6430 ** free pages is not visible. So Cookie[0] is the same as Meta[1].
6432 int sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
6436 BtShared *pBt = p->pBt;
6438 sqlite3BtreeEnter(p);
6441 /* Reading a meta-data value requires a read-lock on page 1 (and hence
6442 ** the sqlite_master table. We grab this lock regardless of whether or
6443 ** not the SQLITE_ReadUncommitted flag is set (the table rooted at page
6444 ** 1 is treated as a special case by queryTableLock() and lockTable()).
6446 rc = queryTableLock(p, 1, READ_LOCK);
6447 if( rc!=SQLITE_OK ){
6448 sqlite3BtreeLeave(p);
6452 assert( idx>=0 && idx<=15 );
6454 /* The b-tree is already holding a reference to page 1 of the database
6455 ** file. In this case the required meta-data value can be read directly
6456 ** from the page data of this reference. This is slightly faster than
6457 ** requesting a new reference from the pager layer.
6459 pP1 = (unsigned char *)pBt->pPage1->aData;
6461 /* The b-tree does not have a reference to page 1 of the database file.
6462 ** Obtain one from the pager layer.
6464 rc = sqlite3PagerGet(pBt->pPager, 1, &pDbPage);
6466 sqlite3BtreeLeave(p);
6469 pP1 = (unsigned char *)sqlite3PagerGetData(pDbPage);
6471 *pMeta = get4byte(&pP1[36 + idx*4]);
6473 /* If the b-tree is not holding a reference to page 1, then one was
6474 ** requested from the pager layer in the above block. Release it now.
6477 sqlite3PagerUnref(pDbPage);
6480 /* If autovacuumed is disabled in this build but we are trying to
6481 ** access an autovacuumed database, then make the database readonly.
6483 #ifdef SQLITE_OMIT_AUTOVACUUM
6484 if( idx==4 && *pMeta>0 ) pBt->readOnly = 1;
6487 /* Grab the read-lock on page 1. */
6488 rc = lockTable(p, 1, READ_LOCK);
6489 sqlite3BtreeLeave(p);
6494 ** Write meta-information back into the database. Meta[0] is
6495 ** read-only and may not be written.
6497 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
6498 BtShared *pBt = p->pBt;
6501 assert( idx>=1 && idx<=15 );
6502 sqlite3BtreeEnter(p);
6504 if( p->inTrans!=TRANS_WRITE ){
6505 rc = pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
6507 assert( pBt->pPage1!=0 );
6508 pP1 = pBt->pPage1->aData;
6509 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6510 if( rc==SQLITE_OK ){
6511 put4byte(&pP1[36 + idx*4], iMeta);
6512 #ifndef SQLITE_OMIT_AUTOVACUUM
6514 assert( pBt->autoVacuum || iMeta==0 );
6515 assert( iMeta==0 || iMeta==1 );
6516 pBt->incrVacuum = iMeta;
6521 sqlite3BtreeLeave(p);
6526 ** Return the flag byte at the beginning of the page that the cursor
6527 ** is currently pointing to.
6529 int sqlite3BtreeFlags(BtCursor *pCur){
6530 /* TODO: What about CURSOR_REQUIRESEEK state? Probably need to call
6531 ** restoreCursorPosition() here.
6534 restoreCursorPosition(pCur);
6535 pPage = pCur->apPage[pCur->iPage];
6536 assert( cursorHoldsMutex(pCur) );
6537 assert( pPage->pBt==pCur->pBt );
6538 return pPage ? pPage->aData[pPage->hdrOffset] : 0;
6543 ** Return the pager associated with a BTree. This routine is used for
6544 ** testing and debugging only.
6546 Pager *sqlite3BtreePager(Btree *p){
6547 return p->pBt->pPager;
6550 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6552 ** Append a message to the error message string.
6554 static void checkAppendMsg(
6555 IntegrityCk *pCheck,
6557 const char *zFormat,
6561 if( !pCheck->mxErr ) return;
6564 va_start(ap, zFormat);
6565 if( pCheck->errMsg.nChar ){
6566 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
6569 sqlite3StrAccumAppend(&pCheck->errMsg, zMsg1, -1);
6571 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap);
6573 if( pCheck->errMsg.mallocFailed ){
6574 pCheck->mallocFailed = 1;
6577 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6579 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6581 ** Add 1 to the reference count for page iPage. If this is the second
6582 ** reference to the page, add an error message to pCheck->zErrMsg.
6583 ** Return 1 if there are 2 ore more references to the page and 0 if
6584 ** if this is the first reference to the page.
6586 ** Also check that the page number is in bounds.
6588 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
6589 if( iPage==0 ) return 1;
6590 if( iPage>pCheck->nPage || iPage<0 ){
6591 checkAppendMsg(pCheck, zContext, "invalid page number %d", iPage);
6594 if( pCheck->anRef[iPage]==1 ){
6595 checkAppendMsg(pCheck, zContext, "2nd reference to page %d", iPage);
6598 return (pCheck->anRef[iPage]++)>1;
6601 #ifndef SQLITE_OMIT_AUTOVACUUM
6603 ** Check that the entry in the pointer-map for page iChild maps to
6604 ** page iParent, pointer type ptrType. If not, append an error message
6607 static void checkPtrmap(
6608 IntegrityCk *pCheck, /* Integrity check context */
6609 Pgno iChild, /* Child page number */
6610 u8 eType, /* Expected pointer map type */
6611 Pgno iParent, /* Expected pointer map parent page number */
6612 char *zContext /* Context description (used for error msg) */
6618 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
6619 if( rc!=SQLITE_OK ){
6620 checkAppendMsg(pCheck, zContext, "Failed to read ptrmap key=%d", iChild);
6624 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
6625 checkAppendMsg(pCheck, zContext,
6626 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
6627 iChild, eType, iParent, ePtrmapType, iPtrmapParent);
6633 ** Check the integrity of the freelist or of an overflow page list.
6634 ** Verify that the number of pages on the list is N.
6636 static void checkList(
6637 IntegrityCk *pCheck, /* Integrity checking context */
6638 int isFreeList, /* True for a freelist. False for overflow page list */
6639 int iPage, /* Page number for first page in the list */
6640 int N, /* Expected number of pages in the list */
6641 char *zContext /* Context for error messages */
6646 while( N-- > 0 && pCheck->mxErr ){
6648 unsigned char *pOvflData;
6650 checkAppendMsg(pCheck, zContext,
6651 "%d of %d pages missing from overflow list starting at %d",
6652 N+1, expected, iFirst);
6655 if( checkRef(pCheck, iPage, zContext) ) break;
6656 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){
6657 checkAppendMsg(pCheck, zContext, "failed to get page %d", iPage);
6660 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
6662 int n = get4byte(&pOvflData[4]);
6663 #ifndef SQLITE_OMIT_AUTOVACUUM
6664 if( pCheck->pBt->autoVacuum ){
6665 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0, zContext);
6668 if( n>pCheck->pBt->usableSize/4-2 ){
6669 checkAppendMsg(pCheck, zContext,
6670 "freelist leaf count too big on page %d", iPage);
6674 Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
6675 #ifndef SQLITE_OMIT_AUTOVACUUM
6676 if( pCheck->pBt->autoVacuum ){
6677 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0, zContext);
6680 checkRef(pCheck, iFreePage, zContext);
6685 #ifndef SQLITE_OMIT_AUTOVACUUM
6687 /* If this database supports auto-vacuum and iPage is not the last
6688 ** page in this overflow list, check that the pointer-map entry for
6689 ** the following page matches iPage.
6691 if( pCheck->pBt->autoVacuum && N>0 ){
6692 i = get4byte(pOvflData);
6693 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage, zContext);
6697 iPage = get4byte(pOvflData);
6698 sqlite3PagerUnref(pOvflPage);
6701 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6703 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6705 ** Do various sanity checks on a single page of a tree. Return
6706 ** the tree depth. Root pages return 0. Parents of root pages
6707 ** return 1, and so forth.
6709 ** These checks are done:
6711 ** 1. Make sure that cells and freeblocks do not overlap
6712 ** but combine to completely cover the page.
6713 ** NO 2. Make sure cell keys are in order.
6714 ** NO 3. Make sure no key is less than or equal to zLowerBound.
6715 ** NO 4. Make sure no key is greater than or equal to zUpperBound.
6716 ** 5. Check the integrity of overflow pages.
6717 ** 6. Recursively call checkTreePage on all children.
6718 ** 7. Verify that the depth of all children is the same.
6719 ** 8. Make sure this page is at least 33% full or else it is
6720 ** the root of the tree.
6722 static int checkTreePage(
6723 IntegrityCk *pCheck, /* Context for the sanity check */
6724 int iPage, /* Page number of the page to check */
6725 MemPage *pParent, /* Parent page */
6726 char *zParentContext /* Parent context */
6729 int i, rc, depth, d2, pgno, cnt;
6738 sqlite3_snprintf(sizeof(zContext), zContext, "Page %d: ", iPage);
6740 /* Check that the page exists
6743 usableSize = pBt->usableSize;
6744 if( iPage==0 ) return 0;
6745 if( checkRef(pCheck, iPage, zParentContext) ) return 0;
6746 if( (rc = sqlite3BtreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
6747 checkAppendMsg(pCheck, zContext,
6748 "unable to get the page. error code=%d", rc);
6751 if( (rc = sqlite3BtreeInitPage(pPage))!=0 ){
6752 checkAppendMsg(pCheck, zContext,
6753 "sqlite3BtreeInitPage() returns error code %d", rc);
6758 /* Check out all the cells.
6761 for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
6766 /* Check payload overflow pages
6768 sqlite3_snprintf(sizeof(zContext), zContext,
6769 "On tree page %d cell %d: ", iPage, i);
6770 pCell = findCell(pPage,i);
6771 sqlite3BtreeParseCellPtr(pPage, pCell, &info);
6773 if( !pPage->intKey ) sz += info.nKey;
6774 assert( sz==info.nPayload );
6775 if( sz>info.nLocal ){
6776 int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
6777 Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
6778 #ifndef SQLITE_OMIT_AUTOVACUUM
6779 if( pBt->autoVacuum ){
6780 checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage, zContext);
6783 checkList(pCheck, 0, pgnoOvfl, nPage, zContext);
6786 /* Check sanity of left child page.
6789 pgno = get4byte(pCell);
6790 #ifndef SQLITE_OMIT_AUTOVACUUM
6791 if( pBt->autoVacuum ){
6792 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, zContext);
6795 d2 = checkTreePage(pCheck,pgno,pPage,zContext);
6796 if( i>0 && d2!=depth ){
6797 checkAppendMsg(pCheck, zContext, "Child page depth differs");
6803 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
6804 sqlite3_snprintf(sizeof(zContext), zContext,
6805 "On page %d at right child: ", iPage);
6806 #ifndef SQLITE_OMIT_AUTOVACUUM
6807 if( pBt->autoVacuum ){
6808 checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage, 0);
6811 checkTreePage(pCheck, pgno, pPage, zContext);
6814 /* Check for complete coverage of the page
6816 data = pPage->aData;
6817 hdr = pPage->hdrOffset;
6818 hit = sqlite3PageMalloc( pBt->pageSize );
6820 pCheck->mallocFailed = 1;
6822 u16 contentOffset = get2byte(&data[hdr+5]);
6823 if (contentOffset > usableSize) {
6824 checkAppendMsg(pCheck, 0,
6825 "Corruption detected in header on page %d",iPage,0);
6826 goto check_page_abort;
6828 memset(hit, 0, usableSize );
6829 memset(hit, 1, get2byte(&data[hdr+5]));
6830 nCell = get2byte(&data[hdr+3]);
6831 cellStart = hdr + 12 - 4*pPage->leaf;
6832 for(i=0; i<nCell; i++){
6833 int pc = get2byte(&data[cellStart+i*2]);
6836 if( pc<=usableSize ){
6837 size = cellSizePtr(pPage, &data[pc]);
6839 if( (pc+size-1)>=usableSize || pc<0 ){
6840 checkAppendMsg(pCheck, 0,
6841 "Corruption detected in cell %d on page %d",i,iPage,0);
6843 for(j=pc+size-1; j>=pc; j--) hit[j]++;
6846 for(cnt=0, i=get2byte(&data[hdr+1]); i>0 && i<usableSize && cnt<10000;
6848 int size = get2byte(&data[i+2]);
6850 if( (i+size-1)>=usableSize || i<0 ){
6851 checkAppendMsg(pCheck, 0,
6852 "Corruption detected in cell %d on page %d",i,iPage,0);
6854 for(j=i+size-1; j>=i; j--) hit[j]++;
6856 i = get2byte(&data[i]);
6858 for(i=cnt=0; i<usableSize; i++){
6861 }else if( hit[i]>1 ){
6862 checkAppendMsg(pCheck, 0,
6863 "Multiple uses for byte %d of page %d", i, iPage);
6867 if( cnt!=data[hdr+7] ){
6868 checkAppendMsg(pCheck, 0,
6869 "Fragmented space is %d byte reported as %d on page %d",
6870 cnt, data[hdr+7], iPage);
6875 if( hit ) sqlite3PageFree(hit);
6880 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
6882 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
6884 ** This routine does a complete check of the given BTree file. aRoot[] is
6885 ** an array of pages numbers were each page number is the root page of
6886 ** a table. nRoot is the number of entries in aRoot.
6888 ** Write the number of error seen in *pnErr. Except for some memory
6889 ** allocation errors, nn error message is held in memory obtained from
6890 ** malloc is returned if *pnErr is non-zero. If *pnErr==0 then NULL is
6893 char *sqlite3BtreeIntegrityCheck(
6894 Btree *p, /* The btree to be checked */
6895 int *aRoot, /* An array of root pages numbers for individual trees */
6896 int nRoot, /* Number of entries in aRoot[] */
6897 int mxErr, /* Stop reporting errors after this many */
6898 int *pnErr /* Write number of errors seen to this variable */
6903 BtShared *pBt = p->pBt;
6906 sqlite3BtreeEnter(p);
6908 nRef = sqlite3PagerRefcount(pBt->pPager);
6909 if( lockBtreeWithRetry(p)!=SQLITE_OK ){
6911 sqlite3BtreeLeave(p);
6912 return sqlite3DbStrDup(0, "cannot acquire a read lock on the database");
6915 sCheck.pPager = pBt->pPager;
6916 sCheck.nPage = pagerPagecount(sCheck.pPager);
6917 sCheck.mxErr = mxErr;
6919 sCheck.mallocFailed = 0;
6921 #ifndef SQLITE_OMIT_AUTOVACUUM
6922 if( pBt->nTrunc!=0 ){
6923 sCheck.nPage = pBt->nTrunc;
6926 if( sCheck.nPage==0 ){
6927 unlockBtreeIfUnused(pBt);
6928 sqlite3BtreeLeave(p);
6931 sCheck.anRef = sqlite3Malloc( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
6932 if( !sCheck.anRef ){
6933 unlockBtreeIfUnused(pBt);
6935 sqlite3BtreeLeave(p);
6938 for(i=0; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
6939 i = PENDING_BYTE_PAGE(pBt);
6940 if( i<=sCheck.nPage ){
6941 sCheck.anRef[i] = 1;
6943 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), 20000);
6945 /* Check the integrity of the freelist
6947 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
6948 get4byte(&pBt->pPage1->aData[36]), "Main freelist: ");
6950 /* Check all the tables.
6952 for(i=0; i<nRoot && sCheck.mxErr; i++){
6953 if( aRoot[i]==0 ) continue;
6954 #ifndef SQLITE_OMIT_AUTOVACUUM
6955 if( pBt->autoVacuum && aRoot[i]>1 ){
6956 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0, 0);
6959 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ");
6962 /* Make sure every page in the file is referenced
6964 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
6965 #ifdef SQLITE_OMIT_AUTOVACUUM
6966 if( sCheck.anRef[i]==0 ){
6967 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6970 /* If the database supports auto-vacuum, make sure no tables contain
6971 ** references to pointer-map pages.
6973 if( sCheck.anRef[i]==0 &&
6974 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
6975 checkAppendMsg(&sCheck, 0, "Page %d is never used", i);
6977 if( sCheck.anRef[i]!=0 &&
6978 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
6979 checkAppendMsg(&sCheck, 0, "Pointer map page %d is referenced", i);
6984 /* Make sure this analysis did not leave any unref() pages
6986 unlockBtreeIfUnused(pBt);
6987 if( nRef != sqlite3PagerRefcount(pBt->pPager) ){
6988 checkAppendMsg(&sCheck, 0,
6989 "Outstanding page count goes from %d to %d during this analysis",
6990 nRef, sqlite3PagerRefcount(pBt->pPager)
6994 /* Clean up and report errors.
6996 sqlite3BtreeLeave(p);
6997 sqlite3_free(sCheck.anRef);
6998 if( sCheck.mallocFailed ){
6999 sqlite3StrAccumReset(&sCheck.errMsg);
7000 *pnErr = sCheck.nErr+1;
7003 *pnErr = sCheck.nErr;
7004 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
7005 return sqlite3StrAccumFinish(&sCheck.errMsg);
7007 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
7010 ** Return the full pathname of the underlying database file.
7012 ** The pager filename is invariant as long as the pager is
7013 ** open so it is safe to access without the BtShared mutex.
7015 const char *sqlite3BtreeGetFilename(Btree *p){
7016 assert( p->pBt->pPager!=0 );
7017 return sqlite3PagerFilename(p->pBt->pPager);
7021 ** Return the pathname of the directory that contains the database file.
7023 ** The pager directory name is invariant as long as the pager is
7024 ** open so it is safe to access without the BtShared mutex.
7026 const char *sqlite3BtreeGetDirname(Btree *p){
7027 assert( p->pBt->pPager!=0 );
7028 return sqlite3PagerDirname(p->pBt->pPager);
7032 ** Return the pathname of the journal file for this database. The return
7033 ** value of this routine is the same regardless of whether the journal file
7034 ** has been created or not.
7036 ** The pager journal filename is invariant as long as the pager is
7037 ** open so it is safe to access without the BtShared mutex.
7039 const char *sqlite3BtreeGetJournalname(Btree *p){
7040 assert( p->pBt->pPager!=0 );
7041 return sqlite3PagerJournalname(p->pBt->pPager);
7044 #ifndef SQLITE_OMIT_VACUUM
7046 ** Copy the complete content of pBtFrom into pBtTo. A transaction
7047 ** must be active for both files.
7049 ** The size of file pTo may be reduced by this operation.
7050 ** If anything goes wrong, the transaction on pTo is rolled back.
7052 ** If successful, CommitPhaseOne() may be called on pTo before returning.
7053 ** The caller should finish committing the transaction on pTo by calling
7054 ** sqlite3BtreeCommit().
7056 static int btreeCopyFile(Btree *pTo, Btree *pFrom){
7060 Pgno nFromPage; /* Number of pages in pFrom */
7061 Pgno nToPage; /* Number of pages in pTo */
7062 Pgno nNewPage; /* Number of pages in pTo after the copy */
7064 Pgno iSkip; /* Pending byte page in pTo */
7065 int nToPageSize; /* Page size of pTo in bytes */
7066 int nFromPageSize; /* Page size of pFrom in bytes */
7068 BtShared *pBtTo = pTo->pBt;
7069 BtShared *pBtFrom = pFrom->pBt;
7070 pBtTo->db = pTo->db;
7071 pBtFrom->db = pFrom->db;
7073 nToPageSize = pBtTo->pageSize;
7074 nFromPageSize = pBtFrom->pageSize;
7076 if( pTo->inTrans!=TRANS_WRITE || pFrom->inTrans!=TRANS_WRITE ){
7077 return SQLITE_ERROR;
7079 if( pBtTo->pCursor ){
7083 nToPage = pagerPagecount(pBtTo->pPager);
7084 nFromPage = pagerPagecount(pBtFrom->pPager);
7085 iSkip = PENDING_BYTE_PAGE(pBtTo);
7087 /* Variable nNewPage is the number of pages required to store the
7088 ** contents of pFrom using the current page-size of pTo.
7090 nNewPage = ((i64)nFromPage * (i64)nFromPageSize + (i64)nToPageSize - 1) /
7093 for(i=1; rc==SQLITE_OK && (i<=nToPage || i<=nNewPage); i++){
7095 /* Journal the original page.
7097 ** iSkip is the page number of the locking page (PENDING_BYTE_PAGE)
7098 ** in database *pTo (before the copy). This page is never written
7099 ** into the journal file. Unless i==iSkip or the page was not
7100 ** present in pTo before the copy operation, journal page i from pTo.
7102 if( i!=iSkip && i<=nToPage ){
7103 DbPage *pDbPage = 0;
7104 rc = sqlite3PagerGet(pBtTo->pPager, i, &pDbPage);
7105 if( rc==SQLITE_OK ){
7106 rc = sqlite3PagerWrite(pDbPage);
7107 if( rc==SQLITE_OK && i>nFromPage ){
7108 /* Yeah. It seems wierd to call DontWrite() right after Write(). But
7109 ** that is because the names of those procedures do not exactly
7110 ** represent what they do. Write() really means "put this page in the
7111 ** rollback journal and mark it as dirty so that it will be written
7112 ** to the database file later." DontWrite() undoes the second part of
7113 ** that and prevents the page from being written to the database. The
7114 ** page is still on the rollback journal, though. And that is the
7115 ** whole point of this block: to put pages on the rollback journal.
7117 rc = sqlite3PagerDontWrite(pDbPage);
7119 sqlite3PagerUnref(pDbPage);
7123 /* Overwrite the data in page i of the target database */
7124 if( rc==SQLITE_OK && i!=iSkip && i<=nNewPage ){
7126 DbPage *pToPage = 0;
7129 rc = sqlite3PagerGet(pBtTo->pPager, i, &pToPage);
7130 if( rc==SQLITE_OK ){
7131 rc = sqlite3PagerWrite(pToPage);
7135 iOff=(i-1)*nToPageSize;
7136 rc==SQLITE_OK && iOff<i*nToPageSize;
7137 iOff += nFromPageSize
7139 DbPage *pFromPage = 0;
7140 Pgno iFrom = (iOff/nFromPageSize)+1;
7142 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) ){
7146 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7147 if( rc==SQLITE_OK ){
7148 char *zTo = sqlite3PagerGetData(pToPage);
7149 char *zFrom = sqlite3PagerGetData(pFromPage);
7152 if( nFromPageSize>=nToPageSize ){
7153 zFrom += ((i-1)*nToPageSize - ((iFrom-1)*nFromPageSize));
7154 nCopy = nToPageSize;
7156 zTo += (((iFrom-1)*nFromPageSize) - (i-1)*nToPageSize);
7157 nCopy = nFromPageSize;
7160 memcpy(zTo, zFrom, nCopy);
7161 sqlite3PagerUnref(pFromPage);
7166 MemPage *p = (MemPage *)sqlite3PagerGetExtra(pToPage);
7168 sqlite3PagerUnref(pToPage);
7173 /* If things have worked so far, the database file may need to be
7174 ** truncated. The complex part is that it may need to be truncated to
7175 ** a size that is not an integer multiple of nToPageSize - the current
7176 ** page size used by the pager associated with B-Tree pTo.
7178 ** For example, say the page-size of pTo is 2048 bytes and the original
7179 ** number of pages is 5 (10 KB file). If pFrom has a page size of 1024
7180 ** bytes and 9 pages, then the file needs to be truncated to 9KB.
7182 if( rc==SQLITE_OK ){
7183 if( nFromPageSize!=nToPageSize ){
7184 sqlite3_file *pFile = sqlite3PagerFile(pBtTo->pPager);
7185 i64 iSize = (i64)nFromPageSize * (i64)nFromPage;
7186 i64 iNow = (i64)((nToPage>nNewPage)?nToPage:nNewPage) * (i64)nToPageSize;
7187 i64 iPending = ((i64)PENDING_BYTE_PAGE(pBtTo)-1) *(i64)nToPageSize;
7189 assert( iSize<=iNow );
7191 /* Commit phase one syncs the journal file associated with pTo
7192 ** containing the original data. It does not sync the database file
7193 ** itself. After doing this it is safe to use OsTruncate() and other
7194 ** file APIs on the database file directly.
7196 pBtTo->db = pTo->db;
7197 rc = sqlite3PagerCommitPhaseOne(pBtTo->pPager, 0, 0, 1);
7198 if( iSize<iNow && rc==SQLITE_OK ){
7199 rc = sqlite3OsTruncate(pFile, iSize);
7202 /* The loop that copied data from database pFrom to pTo did not
7203 ** populate the locking page of database pTo. If the page-size of
7204 ** pFrom is smaller than that of pTo, this means some data will
7205 ** not have been copied.
7207 ** This block copies the missing data from database pFrom to pTo
7208 ** using file APIs. This is safe because at this point we know that
7209 ** all of the original data from pTo has been synced into the
7210 ** journal file. At this point it would be safe to do anything at
7211 ** all to the database file except truncate it to zero bytes.
7213 if( rc==SQLITE_OK && nFromPageSize<nToPageSize && iSize>iPending){
7217 rc==SQLITE_OK && iOff<(iPending+nToPageSize);
7218 iOff += nFromPageSize
7220 DbPage *pFromPage = 0;
7221 Pgno iFrom = (iOff/nFromPageSize)+1;
7223 if( iFrom==PENDING_BYTE_PAGE(pBtFrom) || iFrom>nFromPage ){
7227 rc = sqlite3PagerGet(pBtFrom->pPager, iFrom, &pFromPage);
7228 if( rc==SQLITE_OK ){
7229 char *zFrom = sqlite3PagerGetData(pFromPage);
7230 rc = sqlite3OsWrite(pFile, zFrom, nFromPageSize, iOff);
7231 sqlite3PagerUnref(pFromPage);
7236 /* Sync the database file */
7237 if( rc==SQLITE_OK ){
7238 rc = sqlite3PagerSync(pBtTo->pPager);
7241 rc = sqlite3PagerTruncate(pBtTo->pPager, nNewPage);
7243 if( rc==SQLITE_OK ){
7244 pBtTo->pageSizeFixed = 0;
7249 sqlite3BtreeRollback(pTo);
7254 int sqlite3BtreeCopyFile(Btree *pTo, Btree *pFrom){
7256 sqlite3BtreeEnter(pTo);
7257 sqlite3BtreeEnter(pFrom);
7258 rc = btreeCopyFile(pTo, pFrom);
7259 sqlite3BtreeLeave(pFrom);
7260 sqlite3BtreeLeave(pTo);
7264 #endif /* SQLITE_OMIT_VACUUM */
7267 ** Return non-zero if a transaction is active.
7269 int sqlite3BtreeIsInTrans(Btree *p){
7270 assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
7271 return (p && (p->inTrans==TRANS_WRITE));
7275 ** Return non-zero if a statement transaction is active.
7277 int sqlite3BtreeIsInStmt(Btree *p){
7278 assert( sqlite3BtreeHoldsMutex(p) );
7279 return (p->pBt && p->pBt->inStmt);
7283 ** Return non-zero if a read (or write) transaction is active.
7285 int sqlite3BtreeIsInReadTrans(Btree *p){
7286 assert( sqlite3_mutex_held(p->db->mutex) );
7287 return (p && (p->inTrans!=TRANS_NONE));
7291 ** This function returns a pointer to a blob of memory associated with
7292 ** a single shared-btree. The memory is used by client code for its own
7293 ** purposes (for example, to store a high-level schema associated with
7294 ** the shared-btree). The btree layer manages reference counting issues.
7296 ** The first time this is called on a shared-btree, nBytes bytes of memory
7297 ** are allocated, zeroed, and returned to the caller. For each subsequent
7298 ** call the nBytes parameter is ignored and a pointer to the same blob
7299 ** of memory returned.
7301 ** If the nBytes parameter is 0 and the blob of memory has not yet been
7302 ** allocated, a null pointer is returned. If the blob has already been
7303 ** allocated, it is returned as normal.
7305 ** Just before the shared-btree is closed, the function passed as the
7306 ** xFree argument when the memory allocation was made is invoked on the
7307 ** blob of allocated memory. This function should not call sqlite3_free()
7308 ** on the memory, the btree layer does that.
7310 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
7311 BtShared *pBt = p->pBt;
7312 sqlite3BtreeEnter(p);
7313 if( !pBt->pSchema && nBytes ){
7314 pBt->pSchema = sqlite3MallocZero(nBytes);
7315 pBt->xFreeSchema = xFree;
7317 sqlite3BtreeLeave(p);
7318 return pBt->pSchema;
7322 ** Return true if another user of the same shared btree as the argument
7323 ** handle holds an exclusive lock on the sqlite_master table.
7325 int sqlite3BtreeSchemaLocked(Btree *p){
7327 assert( sqlite3_mutex_held(p->db->mutex) );
7328 sqlite3BtreeEnter(p);
7329 rc = (queryTableLock(p, MASTER_ROOT, READ_LOCK)!=SQLITE_OK);
7330 sqlite3BtreeLeave(p);
7335 #ifndef SQLITE_OMIT_SHARED_CACHE
7337 ** Obtain a lock on the table whose root page is iTab. The
7338 ** lock is a write lock if isWritelock is true or a read lock
7341 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
7344 u8 lockType = READ_LOCK + isWriteLock;
7345 assert( READ_LOCK+1==WRITE_LOCK );
7346 assert( isWriteLock==0 || isWriteLock==1 );
7347 sqlite3BtreeEnter(p);
7348 rc = queryTableLock(p, iTab, lockType);
7349 if( rc==SQLITE_OK ){
7350 rc = lockTable(p, iTab, lockType);
7352 sqlite3BtreeLeave(p);
7358 #ifndef SQLITE_OMIT_INCRBLOB
7360 ** Argument pCsr must be a cursor opened for writing on an
7361 ** INTKEY table currently pointing at a valid table entry.
7362 ** This function modifies the data stored as part of that entry.
7363 ** Only the data content may only be modified, it is not possible
7364 ** to change the length of the data stored.
7366 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
7367 assert( cursorHoldsMutex(pCsr) );
7368 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
7369 assert(pCsr->isIncrblobHandle);
7371 restoreCursorPosition(pCsr);
7372 assert( pCsr->eState!=CURSOR_REQUIRESEEK );
7373 if( pCsr->eState!=CURSOR_VALID ){
7374 return SQLITE_ABORT;
7377 /* Check some preconditions:
7378 ** (a) the cursor is open for writing,
7379 ** (b) there is no read-lock on the table being modified and
7380 ** (c) the cursor points at a valid row of an intKey table.
7382 if( !pCsr->wrFlag ){
7383 return SQLITE_READONLY;
7385 assert( !pCsr->pBt->readOnly
7386 && pCsr->pBt->inTransaction==TRANS_WRITE );
7387 if( checkReadLocks(pCsr->pBtree, pCsr->pgnoRoot, pCsr, 0) ){
7388 return SQLITE_LOCKED; /* The table pCur points to has a read lock */
7390 if( pCsr->eState==CURSOR_INVALID || !pCsr->apPage[pCsr->iPage]->intKey ){
7391 return SQLITE_ERROR;
7394 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 0, 1);
7398 ** Set a flag on this cursor to cache the locations of pages from the
7399 ** overflow list for the current row. This is used by cursors opened
7400 ** for incremental blob IO only.
7402 ** This function sets a flag only. The actual page location cache
7403 ** (stored in BtCursor.aOverflow[]) is allocated and used by function
7404 ** accessPayload() (the worker function for sqlite3BtreeData() and
7405 ** sqlite3BtreePutData()).
7407 void sqlite3BtreeCacheOverflow(BtCursor *pCur){
7408 assert( cursorHoldsMutex(pCur) );
7409 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
7410 assert(!pCur->isIncrblobHandle);
7411 assert(!pCur->aOverflow);
7412 pCur->isIncrblobHandle = 1;