Redis資料持久化機制AOF原理分析一

來源:互聯網
上載者:User

/* Function called at startup to load RDB or AOF file in memory. */void loadDataFromDisk(void) {    long long start = ustime();    if (server.aof_state == REDIS_AOF_ON) {        if (loadAppendOnlyFile(server.aof_filename) == REDIS_OK)            redisLog(REDIS_NOTICE,"DB loaded from append only file: %.3f seconds",(float)(ustime()-start)/1000000);    } else {        if (rdbLoad(server.rdb_filename) == REDIS_OK) {            redisLog(REDIS_NOTICE,"DB loaded from disk: %.3f seconds",                (float)(ustime()-start)/1000000);        } else if (errno != ENOENT) {            redisLog(REDIS_WARNING,"Fatal error loading the DB: %s. Exiting.",strerror(errno));            exit(1);        }    }}
Server啟動時,只要AOF功能已開啟就優先載入AOF檔案,這是因為AOF檔案中的資料通常比RDB檔案中的資料更新。

/* Replay the append only file `filename` into memory.
 *
 * Each command stored in the file is parsed and executed through a fake
 * client, with AOF temporarily switched off while loading.
 *
 * Returns REDIS_ERR when the file can be opened but is empty, and REDIS_OK
 * once EOF is reached without errors. The process exits if the file cannot
 * be opened or if an unknown command is found in the file. */
int loadAppendOnlyFile(char *filename) {
    struct redisClient *fakeClient;
    FILE *fp = fopen(filename,"r");
    struct redis_stat sb;
    int old_aof_state = server.aof_state;
    long loops = 0;

    /* redis_fstat() stores the status of the descriptor returned by
     * fileno(fp) into sb (the article describes it as an alias of
     * fstat64); sb.st_size is the file size in bytes. An empty file
     * means there is nothing to load. */
    if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
        server.aof_current_size = 0;
        fclose(fp);
        return REDIS_ERR;
    }

    if (fp == NULL) { /* Opening the file failed. */
        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
        exit(1);
    }

    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
     * to the same file we're about to read. */
    server.aof_state = REDIS_AOF_OFF;

    fakeClient = createFakeClient(); /* Fake client used to run the commands. */
    startLoading(fp); /* Per the original note: defined in rdb.c, updates the server loading state. */

    while(1) {
        int argc, j;
        unsigned long len;
        robj **argv;
        char buf[128];
        sds argsds;
        struct redisCommand *cmd;

        /* Serve the clients from time to time: once every 1000 iterations
         * record the current file offset and process pending file events
         * without waiting. */
        if (!(loops++ % 1000)) {
            loadingProgress(ftello(fp)); /* Report the current read position in the AOF. */
            aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT); /* Process events. */
        }

        /* Read the next line of the AOF. */
        if (fgets(buf,sizeof(buf),fp) == NULL) {
            if (feof(fp)) /* End of file reached. */
                break;
            else
                goto readerr;
        }

        /* Parse one command: a "*<argc>" header line followed by argc
         * arguments, each introduced by a "$<len>" line. */
        if (buf[0] != '*') goto fmterr;
        argc = atoi(buf+1); /* Number of arguments. */
        if (argc < 1) goto fmterr;

        argv = zmalloc(sizeof(robj*)*argc); /* Argument vector. */
        for (j = 0; j < argc; j++) {
            if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
            if (buf[0] != '$') goto fmterr;
            len = strtol(buf+1,NULL,10); /* Length of this bulk argument. */
            argsds = sdsnewlen(NULL,len); /* sds buffer that will receive len bytes. */
            /* Read exactly len bytes of payload. */
            if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
            argv[j] = createObject(REDIS_STRING,argsds);
            if (fread(buf,2,1,fp) == 0) goto fmterr; /* Discard the trailing CRLF. */
        }

        /* Command lookup */
        cmd = lookupCommand(argv[0]->ptr);
        if (!cmd) {
            redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", (char*)argv[0]->ptr);
            exit(1);
        }

        /* Run the command in the context of a fake client */
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        cmd->proc(fakeClient); /* Execute the command. */

        /* The fake client should not have a reply */
        redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
        /* The fake client should never get blocked */
        redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);

        /* Clean up. Command code may have changed argv/argc so we use the
         * argv/argc of the client instead of the local variables. */
        for (j = 0; j < fakeClient->argc; j++)
            decrRefCount(fakeClient->argv[j]);
        zfree(fakeClient->argv);
    }

    /* This point can only be reached when EOF is reached without errors.
     * If the client is in the middle of a MULTI/EXEC, log error and quit. */
    if (fakeClient->flags & REDIS_MULTI) goto readerr;

    fclose(fp);
    freeFakeClient(fakeClient);
    server.aof_state = old_aof_state;
    stopLoading();
    aofUpdateCurrentSize(); /* Refresh server.aof_current_size with the AOF file size. */
    server.aof_rewrite_base_size = server.aof_current_size;
    return REDIS_OK;

    /* NOTE(review): the readerr and fmterr labels targeted by the gotos
     * above are not shown; the excerpt elides them at this point. */
…………
}
在前面一篇關於AOF參數配置的部落格遺留了一個問題,server.aof_current_size參數的初始化,下面解決這個疑問。

void aofUpdateCurrentSize(void) {    struct redis_stat sb;    if (redis_fstat(server.aof_fd,&sb) == -1) {        redisLog(REDIS_WARNING,"Unable to obtain the AOF file length. stat: %s",            strerror(errno));    } else {        server.aof_current_size = sb.st_size;    }}
redis_fstat是作者對Linux中fstat64函數的重新命名,該函數用於擷取檔案相關的狀態資訊,具體可以Google之;sb.st_size就是當前AOF檔案的大小。這裡需要知道server.aof_fd即AOF檔案描述符,該參數的初始化在initServer()函數中:

/* When AOF is enabled, open the append only file for appending (creating
 * it if it does not exist) and keep the descriptor in server.aof_fd.
 * Failure to open the file is fatal. */
if (server.aof_state == REDIS_AOF_ON) {
    int aof_open_flags = O_WRONLY|O_APPEND|O_CREAT;

    server.aof_fd = open(server.aof_filename, aof_open_flags, 0644);
    if (server.aof_fd == -1) {
        redisLog(REDIS_WARNING,
                 "Can't open the append-only file: %s",
                 strerror(errno));
        exit(1);
    }
}

至此,Redis Server啟動載入硬碟中AOF檔案資料的操作就成功結束了。

當用戶端執行Set等修改資料庫中欄位的指令時就會造成Server資料庫中資料被修改,這些修改的資料應該被即時更新到AOF檔案中,並且也要按照一定的fsync機制重新整理到硬碟中,保證資料不會丟失。

/* This function gets called every time Redis is entering the * main loop of the event driven library, that is, before to sleep * for ready file descriptors. */void beforeSleep(struct aeEventLoop *eventLoop) {    REDIS_NOTUSED(eventLoop);    listNode *ln;    redisClient *c;    /* Run a fast expire cycle (the called function will return     * ASAP if a fast cycle is not needed). */    if (server.active_expire_enabled && server.masterhost == NULL)        activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST);    /* Try to process pending commands for clients that were just unblocked. */    while (listLength(server.unblocked_clients)) {        ln = listFirst(server.unblocked_clients);        redisAssert(ln != NULL);        c = ln->value;        listDelNode(server.unblocked_clients,ln);        c->flags &= ~REDIS_UNBLOCKED;        /* Process remaining data in the input buffer. */        //處理用戶端在阻塞期間接收到的用戶端發送的請求        if (c->querybuf && sdslen(c->querybuf) > 0) {            server.current_client = c;            processInputBuffer(c);            server.current_client = NULL;        }    }    /* Write the AOF buffer on disk */    //將server.aof_buf中的資料追加到AOF檔案中並fsync到硬碟上    flushAppendOnlyFile(0);}
通過上面的代碼及注釋可以發現,beforeSleep函數做了三件事:1、處理到期鍵,2、處理阻塞期間的用戶端請求,3、將server.aof_buf中的資料追加到AOF檔案中並fsync重新整理到硬碟上,flushAppendOnlyFile函數給定了一個參數force,表示是否強制寫入AOF檔案,0表示非強制即支援延遲寫,1表示強制寫入。

/* Write the contents of server.aof_buf to the AOF file and fsync it
 * according to the configured policy (server.aof_fsync).
 *
 * force: when zero and the policy is AOF_FSYNC_EVERYSEC, the write may be
 *        postponed (for less than two seconds) while a background fsync is
 *        still pending; when non-zero the write is never postponed.
 *
 * Returns immediately if the buffer is empty. Any write error or short
 * write is logged and terminates the process with exit(1). */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;

    if (sdslen(server.aof_buf) == 0) return;

    /* With the every-second policy, record whether any background fsync
     * job is still pending. */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

    /* Every-second policy and no forced write: postpone the flush if
     * possible. */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* If you are following this code path, then we are going to write so
     * reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    if (nwritten != (signed)sdslen(server.aof_buf)) { /* Error or short write. */
        /* Ooops, we are in troubles. The best thing to do for now is
         * aborting instead of giving the illusion that everything is
         * working as expected. */
        if (nwritten == -1) {
            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
        } else {
            /* NOTE(review): in this branch write() returned a byte count,
             * not -1, so errno may hold a stale value and the strerror()
             * text below can be misleading -- confirm before relying on it. */
            redisLog(REDIS_WARNING,"Exiting on short write while writing to "
                                   "the append-only file: %s (nwritten=%ld, "
                                   "expected=%ld)",
                                   strerror(errno),
                                   (long)nwritten,
                                   (long)sdslen(server.aof_buf));

            /* Try to cut the file back to the last recorded size so the
             * partial write is removed. */
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                redisLog(REDIS_WARNING, "Could not remove short write "
                         "from the append-only file.  Redis may refuse "
                         "to load the AOF the next time it starts.  "
                         "ftruncate: %s", strerror(errno));
            }
        }
        exit(1);
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary).
     * Otherwise free it and start again from an empty buffer. */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. The data has already been
     * written to the AOF file at this point; it just has not been fsynced. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) { /* Always policy: fsync right here. */
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        /* Every-second policy: hand the fsync to the background job,
         * unless one is already pending. */
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}
上述代碼中請關注server.aof_fsync參數,即設定Redis fsync AOF檔案到硬碟的策略:如果設定為AOF_FSYNC_ALWAYS,那麼直接在主線程中同步執行fsync;如果設定為AOF_FSYNC_EVERYSEC,那麼放入後台線程中fsync,後台線程的代碼在bio.c中。







相關文章

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.