Redis源碼分析（十九）--- replication主從資料複製的實現

最後更新：2018-07-30 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

replication這個英文單詞的原意是“複製”。replication檔案是我在Data目錄下分析的最後一個檔案，足以說明它的重要性；代碼量1800+，的確非常難啃。這裡只能說說我看完代碼後的大致印象，要我畫個結構圖好好理清這裡面各個API之間的關係，我目前還真做不到。說到主從複製，這是實現讀寫分離最常見、也是最好的手段之一：當使用者數達到一定量，一個伺服器承受不了上千萬的pv時，採取主從資料庫的形式也是一般架構師能夠想到的一種手段。Redis的主從資料庫在我這裡就稱為主用戶端、從用戶端，因為用戶端中有其所屬的db，而資料庫是基於用戶端本身進行複製操作的。也就是說，一個Redis中存在一個master主用戶端和多個slave從用戶端，要實現的就是slave從主用戶端複製資料。因為API比較多，進行了稍稍的歸類:

/* ---------------------------------- MASTER -------------------------------- */
void createReplicationBacklog(void) /* 建立backlog的buffer */
void resizeReplicationBacklog(long long newsize) /* 調整複本備份日誌的大小，當replication backlog的大小設定被修改的時候調用 */
void freeReplicationBacklog(void) /* 釋放備份日誌 */
void feedReplicationBacklog(void *ptr, size_t len) /* 往備份日誌中添加資料，會引起master_repl_offset位移量的增加 */
void feedReplicationBacklogWithObject(robj *o) /* 往backlog添加資料，以Redis字串對象作為參數 */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) /* 將主用戶端的資料傳播給從用戶端 */
void replicationFeedMonitors(redisClient *c, list *monitors, int dictid, robj **argv, int argc) /* 發送資料給monitor監聽者用戶端 */
long long addReplyReplicationBacklog(redisClient *c, long long offset) /* 向slave從用戶端回覆備份日誌(backlog)中的資料 */
int masterTryPartialResynchronization(redisClient *c) /* 主用戶端嘗試部分重新同步(partial resync) */
void syncCommand(redisClient *c) /* 同步命令函數 */
void replconfCommand(redisClient *c) /* 此函數用於從用戶端設定複製過程中的執行參數 */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) /* 給slave用戶端發送BULK資料 */
void updateSlavesWaitingBgsave(int bgsaveerr) /* 此方法在後台儲存進程結束時調用，更新slave從用戶端 */
/* ----------------------------------- SLAVE -------------------------------- */
void replicationAbortSyncTransfer(void) /* 中止與master主用戶端的同步操作 */
void replicationSendNewlineToMaster(void) /* 從用戶端發送空行給主用戶端，破壞了原本的協議格式，避免讓主用戶端檢測出從用戶端逾時的情況 */
void replicationEmptyDbCallback(void *privdata) /* 清空資料庫後的回調方法，當老資料被重新整理出去之後等待載入新資料的時候調用 */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) /* 從用戶端讀取同步的Sync的BULK資料 */
char *sendSynchronousCommand(int fd, ...) /* 從用戶端發送給主用戶端同步資料的命令，附上驗證資訊，和一些參數配置資訊 */
int slaveTryPartialResynchronization(int fd) /* 從用戶端嘗試部分重新同步(partial resync)操作 */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) /* 與主用戶端保持同步，期間包括連接埠號碼等的確認，socket連接 */
int connectWithMaster(void) /* 連接主用戶端 */
void undoConnectWithMaster(void) /* 撤銷與主用戶端的連接 */
int cancelReplicationHandshake(void) /* 當已經存在一個複製進程時，中止一個非阻塞的replication複製的嘗試 */
void replicationSetMaster(char *ip, int port) /* 設定主用戶端的ip地址和連接埠號碼 */
void replicationUnsetMaster(void)
void slaveofCommand(redisClient *c)
void roleCommand(redisClient *c)
void replicationSendAck(void) /* 發送ACK包給主用戶端，告知當前的複製位移量 */
/* ---------------------- MASTER CACHING FOR PSYNC -------------------------- */
void replicationCacheMaster(redisClient *c) /* 緩衝主用戶端資訊 */
void replicationDiscardCachedMaster(void) /* 當某個用戶端將不會再回複的時候，可以釋放掉緩衝的主用戶端 */
void replicationResurrectCachedMaster(int newfd) /* 將緩衝的主用戶端複活 */
/* ------------------------- MIN-SLAVES-TO-WRITE  --------------------------- */
void refreshGoodSlavesCount(void) /* 更新slave從用戶端數量 */
void replicationScriptCacheInit(void)
void replicationScriptCacheFlush(void)
void replicationScriptCacheAdd(sds sha1)
int replicationScriptCacheExists(sds sha1)
void replicationCron(void)

找一個標準的slave從用戶端向主用戶端實現同步的操作:

/* File-event handler that drives the slave side of the handshake with the
 * master over the socket `fd`.
 *
 * Depending on server.repl_state the handler:
 *   - REDIS_REPL_NONE: closes `fd` and returns (replication was turned off).
 *   - REDIS_REPL_CONNECTING: sends a PING to the master, switches to
 *     REDIS_REPL_RECEIVE_PONG and returns, waiting for the readable event.
 *   - REDIS_REPL_RECEIVE_PONG: reads and validates the PING reply, then
 *     continues with AUTH (if server.masterauth is set), REPLCONF
 *     listening-port, PSYNC (falling back to SYNC), creation of the temp
 *     file for the bulk payload and registration of readSyncBulkPayload()
 *     as the readable handler. On success the state becomes
 *     REDIS_REPL_TRANSFER.
 *
 * On any failure the socket is closed, server.repl_transfer_s is set to -1
 * and the state goes back to REDIS_REPL_CONNECT. A temp file that was
 * already created by this call is closed and removed as well, so a failed
 * attempt leaks neither a descriptor nor a file.
 *
 * The `el`, `privdata` and `mask` arguments are unused. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
    char tmpfile[256], *err;
    int dfd = -1, maxtries = 5; /* dfd == -1 means "no temp file opened". */
    int sockerr = 0, psync_result;
    socklen_t errlen = sizeof(sockerr);
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(privdata);
    REDIS_NOTUSED(mask);

    /* If this event fired after the user turned the instance into a master
     * with SLAVEOF NO ONE we must just return ASAP. */
    if (server.repl_state == REDIS_REPL_NONE) {
        close(fd);
        return;
    }

    /* Check for errors in the socket. */
    if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1)
        sockerr = errno;
    if (sockerr) {
        aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE);
        redisLog(REDIS_WARNING,"Error condition on socket for SYNC: %s",
            strerror(sockerr));
        goto error;
    }

    /* If we were connecting, it's time to send a non blocking PING, we want to
     * make sure the master is able to reply before going into the actual
     * replication process where we have long timeouts in the order of
     * seconds (in the meantime the slave would block).
     *
     * Note that it is this instance (the slave) that sends the PING; the
     * master is expected to answer with a PONG. */
    if (server.repl_state == REDIS_REPL_CONNECTING) {
        redisLog(REDIS_NOTICE,"Non blocking connect for SYNC fired the event.");
        /* Delete the writable event so that the readable event remains
         * registered and we can wait for the PONG reply. */
        aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
        server.repl_state = REDIS_REPL_RECEIVE_PONG;
        /* Send the PING, don't check for errors at all, we have the timeout
         * that will take care about this. */
        syncWrite(fd,"PING\r\n",6,100);
        return;
    }

    /* Receive the PONG command. */
    if (server.repl_state == REDIS_REPL_RECEIVE_PONG) {
        char buf[1024];

        /* Delete the readable event, we no longer need it now that there is
         * the PING reply to read. */
        aeDeleteFileEvent(server.el,fd,AE_READABLE);

        /* Read the reply with explicit timeout. */
        buf[0] = '\0';
        if (syncReadLine(fd,buf,sizeof(buf),
            server.repl_syncio_timeout*1000) == -1)
        {
            redisLog(REDIS_WARNING,
                "I/O error reading PING reply from master: %s",
                strerror(errno));
            goto error;
        }

        /* We accept only two replies as valid, a positive +PONG reply
         * (we just check for "+") or an authentication error.
         * Note that older versions of Redis replied with "operation not
         * permitted" instead of using a proper error code, so we test
         * both. */
        if (buf[0] != '+' &&
            strncmp(buf,"-NOAUTH",7) != 0 &&
            strncmp(buf,"-ERR operation not permitted",28) != 0)
        {
            redisLog(REDIS_WARNING,"Error reply to PING from master: '%s'",buf);
            goto error;
        } else {
            redisLog(REDIS_NOTICE,
                "Master replied to PING, replication can continue...");
        }
    }

    /* AUTH with the master if required. */
    if(server.masterauth) {
        err = sendSynchronousCommand(fd,"AUTH",server.masterauth,NULL);
        if (err[0] == '-') {
            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",err);
            sdsfree(err);
            goto error;
        }
        sdsfree(err);
    }

    /* Set the slave port, so that Master's INFO command can list the
     * slave listening port correctly. */
    {
        sds port = sdsfromlonglong(server.port);
        err = sendSynchronousCommand(fd,"REPLCONF","listening-port",port,
                                         NULL);
        sdsfree(port);
        /* Ignore the error if any, not all the Redis versions support
         * REPLCONF listening-port. */
        if (err[0] == '-') {
            redisLog(REDIS_NOTICE,"(Non critical) Master does not understand REPLCONF listening-port: %s", err);
        }
        sdsfree(err);
    }

    /* Try a partial resynchronization. If we don't have a cached master
     * slaveTryPartialResynchronization() will at least try to use PSYNC
     * to start a full resynchronization so that we get the master run id
     * and the global offset, to try a partial resync at the next
     * reconnection attempt. */
    psync_result = slaveTryPartialResynchronization(fd);
    if (psync_result == PSYNC_CONTINUE) {
        redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
        return;
    }

    /* Fall back to SYNC if needed. Otherwise psync_result == PSYNC_FULLRESYNC
     * and the server.repl_master_runid and repl_master_initial_offset are
     * already populated. */
    if (psync_result == PSYNC_NOT_SUPPORTED) {
        redisLog(REDIS_NOTICE,"Retrying with SYNC...");
        if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
            redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
                strerror(errno));
            goto error;
        }
    }

    /* Prepare a suitable temp file for bulk transfer. O_EXCL guarantees
     * that a file we manage to open was created by this very call. */
    while(maxtries--) {
        snprintf(tmpfile,sizeof(tmpfile),
            "temp-%d.%ld.rdb",(int)server.unixtime,(long int)getpid());
        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
        if (dfd != -1) break;
        sleep(1);
    }
    if (dfd == -1) {
        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
        goto error;
    }

    /* Setup the non blocking download of the bulk file. */
    if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
            == AE_ERR)
    {
        redisLog(REDIS_WARNING,
            "Can't create readable event for SYNC: %s (fd=%d)",
            strerror(errno),fd);
        goto error;
    }

    server.repl_state = REDIS_REPL_TRANSFER;
    server.repl_transfer_size = -1;
    server.repl_transfer_read = 0;
    server.repl_transfer_last_fsync_off = 0;
    server.repl_transfer_fd = dfd;
    server.repl_transfer_lastio = server.unixtime;
    server.repl_transfer_tmpfile = zstrdup(tmpfile);
    return;

error:
    /* The temp file is only handed over to the server state on success, so
     * if it was opened and we end up here it must be released now. */
    if (dfd != -1) {
        close(dfd);
        unlink(tmpfile);
    }
    close(fd);
    server.repl_transfer_s = -1;
    server.repl_state = REDIS_REPL_CONNECT;
    return;
}

在replication中，有一個cached master（緩衝主用戶端）的概念，就是可以臨時緩衝主用戶端的資訊，一般用於master和slave突然中斷連線的時候，這樣下次進行主從同步時可以快速恢複:

/* Keep the state of the current master client around in
 * server.cached_master instead of freeing it.
 *
 * The client `c` is unlinked from server.clients, its file events are
 * removed and its socket is closed, but the client structure itself is
 * preserved. The function requires that a master exists and that nothing
 * is cached yet (both are asserted). */
void replicationCacheMaster(redisClient *c) {
    listNode *node;

    redisAssert(server.master != NULL && server.cached_master == NULL);
    redisLog(REDIS_NOTICE,"Caching the disconnected master state.");

    /* Unlink the client from the global list: it must not be listed by
     * CLIENT LIST nor be touched by operations iterating over all clients. */
    node = listSearchKey(server.clients,c);
    redisAssert(node != NULL);
    listDelNode(server.clients,node);

    /* Remember the master. server.master itself is reset to NULL later by
     * replicationHandleMasterDisconnection(). */
    server.cached_master = server.master;

    /* Drop both event handlers and close the socket. The socket of the new
     * connection with the master will be reused later during PSYNC. */
    aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
    aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
    close(c->fd);

    /* With fd == -1 a later freeClient(c) can be called safely. */
    c->fd = -1;

    /* The cached peer ID is no longer valid without the socket. */
    if (c->peerid != NULL) {
        sdsfree(c->peerid);
        c->peerid = NULL;
    }

    /* Caching replaces the actual freeClient() call, so the replication
     * state has to be adjusted here. This call also sets server.master
     * to NULL. */
    replicationHandleMasterDisconnection();
}

當想讓這個緩衝的master複活的時候，調用下面的方法:

/* Turn the cached master into the current master, using the file descriptor * passed as argument as the socket for the new master. * * This funciton is called when successfully setup a partial resynchronization * so the stream of data that we'll receive will start from were this * master left. *//* 將快取用戶端複活 */void replicationResurrectCachedMaster(int newfd) {//將cached_master賦值為主用戶端    server.master = server.cached_master;    server.cached_master = NULL;    server.master->fd = newfd;    server.master->flags &= ~(REDIS_CLOSE_AFTER_REPLY|REDIS_CLOSE_ASAP);    server.master->authenticated = 1;    server.master->lastinteraction = server.unixtime;    server.repl_state = REDIS_REPL_CONNECTED;    /* Re-add to the list of clients. */    //重新添加入用戶端列表中    listAddNodeTail(server.clients,server.master);    if (aeCreateFileEvent(server.el, newfd, AE_READABLE,                          readQueryFromClient, server.master)) {        redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno));        freeClientAsync(server.master); /* Close ASAP. */    }    /* We may also need to install the write handler as well if there is     * pending data in the write buffers. */    if (server.master->bufpos || listLength(server.master->reply)) {        if (aeCreateFileEvent(server.el, newfd, AE_WRITABLE,                          sendReplyToClient, server.master)) {            redisLog(REDIS_WARNING,"Error resurrecting the cached master, impossible to add the writable handler: %s", strerror(errno));            freeClientAsync(server.master); /* Close ASAP. */        }    }}

當然，如果確定未來不會再使用緩衝的master，可以將它徹底釋放:

/* Release the cached master, if any. Called when the conditions for a
 * partial resync on reconnection no longer hold. Does nothing when no
 * master is cached. */
void replicationDiscardCachedMaster(void) {
    if (server.cached_master != NULL) {
        redisLog(REDIS_NOTICE,"Discarding previously cached master state.");

        /* NOTE(review): the master flag is cleared before freeing,
         * presumably so that freeClient() handles this client like an
         * ordinary one -- confirm against freeClient(). */
        server.cached_master->flags &= ~REDIS_MASTER;
        freeClient(server.cached_master);
        server.cached_master = NULL;
    }
}

在這裡面靠的就是server.cached_master屬性。slave在和master串連的時候，要進行master的ip地址和Port連接埠的確認：

/* Set replication to the specified master address and port. *//* 設定主用戶端的ip地址和連接埠號碼 */void replicationSetMaster(char *ip, int port) {    sdsfree(server.masterhost);    server.masterhost = sdsdup(ip);    server.masterport = port;    //設定完畢之後，斷開所有的串連，中止replication進程    if (server.master) freeClient(server.master);    disconnectSlaves(); /* Force our slaves to resync with us as well. */    replicationDiscardCachedMaster(); /* Don't try a PSYNC. */    freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */    cancelReplicationHandshake();    server.repl_state = REDIS_REPL_CONNECT;    server.master_repl_offset = 0;}

主從複製的實現其實還有很多細節和步驟，這裡只是稍稍分析了一下，以後有機會再研究得更深入一點。

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More