From: Pekka Riikonen Date: Mon, 13 Oct 2003 18:38:39 +0000 (+0000) Subject: Backup router testing, fixes. X-Git-Tag: silc.client.0.9.13~4 X-Git-Url: http://git.silcnet.org/gitweb/?p=silc.git;a=commitdiff_plain;h=c8f99a8ce7f8ba6cf87ebcd0900f37023ff9955e Backup router testing, fixes. --- diff --git a/CHANGES b/CHANGES index 26341b4e..5918b56f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +Mon Oct 13 21:37:47 EEST 2003 Pekka Riikonen + + * Continued backup router tests and fixes. Affected files + silcd/server_backup, server_util.c, server.c. See TODO. + Sun Oct 12 19:58:18 EEST 2003 Pekka Riikonen * Fixed SERVER_SIGNOFF handling in servers. The client diff --git a/TODO b/TODO index 7c7fa709..1dbc95d2 100644 --- a/TODO +++ b/TODO @@ -11,9 +11,77 @@ TODO for SILC Server 1.0 o Backup router testing - - test all resume error cases for backup router - - test all resume error cases for normal server - - test all resume error cases for primary router + - Switching tests + - (1) primary goes down (works) + - (2) server(s) looses primary, but backup doesn't + - Works, if the primary sends SERVER_SIGNOFF to backup for the + the signoffed servers it will disconnect them. The servers + will reconnect to primary, no desync. If primary sends ping + back (before SERVER_SIGNOFF) the backup returns failure to + server. Server resends START_USE, and same repeats. Either + the SERVER_SIGNOFF is received or the server disconnects from + backup, and reconnects to primary. + - (3) server looses primary, but backup doesn't, but ping timeouts + (no crash in primary) + - Works, the backup will be primary, server will notice it and + network works. When the backup gets connection back to primary, + primary will reject resuming. Backup switches back to backup + router. Server timeouts, disconnects and reconnects to primary + to avoid desync. + - (4) backup looses primary, but server(s) doesn't + - Works, the backup is in desync. 
When backup connects back to + primary it attempts to execute the resuming. The primary will + reject this. The backup accepts it and resumes as backup + router, no desync. Servers get the backup resuming protocol + but it will time out, and no other action is taken. No desync in + servers. + - (5) server loses primary, backup crashes + - Works, the server will attempt to reconnect to the primary + and backup. Server will be cut from rest of the network. + - (6) server loses primary, backup doesn't, rejects server's use, + then backup loses primary + - Same as (3), if primary crashed, normal resuming occurs. + + - Resuming tests + - (1) normal resuming (works) + - (2) backup crashes during resuming + - Works, no desync in router or server. Server reconnects to + the router to avoid desync in server. + - (3) primary crashes during resuming + - Works, no desync in backup or server. Backup handles crash + during first contact, during resuming and immediately after. + Server handles crash during first contact, after contact and + immediately after. In case of error server falls back to the + backup router. If backup rejects fallback, server disconnects + and reconnects to both backup and primary. + - (4) a server crashes during resuming (multiple servers present) + - Works, no desync in backup or servers. The backup restarts the + protocol after timeout. After that the protocol executes + quickly since all other servers are already connected to the + primary. While waiting for the restart, servers fall back to the + backup. + - (5) server can connect to primary but cannot communicate + - Works, no desync in server. The server notices that the + protocol did not succeed, and verifies from backup whether it + can be used still. If backup refuses, the server reconnects + to the primary router. + - (6) backup can connect to primary but cannot communicate + - Same as (9). + - (7) primary won't communicate with server + - Same as (5). 
+ - (8) primary won't communicate with backup + - Same as (6), (9). + - (9) backup cannot communicate with server + - Works, no desync in backup or servers. The backup restarts the + protocol after timeout. This repeats until the protocol + is executed successfully. Primary can communicate through the + backup during this. Servers fall back to backup after timeout + occurs and wait for new resuming. If the server never answers + anything to backup (but is up) then resuming lasts forever, + until the server is either removed from network or starts + communicating. However, this does not cause network desync. + - (10) server cannot communicate with backup + - Same as (5), (9). - Notifys (works) JOIN, TOPIC_SET, CMODE_CHANGE, CUMODE_CHANGE, CHANNEL_CHANGE, diff --git a/apps/silcd/server.c b/apps/silcd/server.c index 291b0922..963746f9 100644 --- a/apps/silcd/server.c +++ b/apps/silcd/server.c @@ -1443,16 +1443,23 @@ SILC_TASK_CALLBACK(silc_server_connect_to_router_final) server->standalone = FALSE; server->backup_primary = FALSE; - /* If we are router then announce our possible servers. Backup - router announces also global servers. */ - if (server->server_type == SILC_ROUTER) - silc_server_announce_servers(server, - server->backup_router ? TRUE : FALSE, - 0, SILC_PRIMARY_ROUTE(server)); + /* Announce data if we are not backup router (unless not as primary + currently). Backup router announces later at the end of + resuming protocol. */ + if (server->backup_router && server->server_type == SILC_ROUTER) { + SILC_LOG_DEBUG(("Announce data after resume protocol")); + } else { + /* If we are router then announce our possible servers. Backup + router announces also global servers. */ + if (server->server_type == SILC_ROUTER) + silc_server_announce_servers(server, + server->backup_router ? 
TRUE : FALSE, + 0, SILC_PRIMARY_ROUTE(server)); - /* Announce our clients and channels to the router */ - silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server)); - silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server)); + /* Announce our clients and channels to the router */ + silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server)); + silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server)); + } /* If we are backup router then this primary router is whom we are backing up. */ @@ -2634,6 +2641,18 @@ void silc_server_packet_parse_type(SilcServer server, /* Attempt to reconnect to primary */ SILC_LOG_DEBUG(("Received failed START_USE from backup %s", sock->ip)); + /* Default action is to disconnect from backup and reconnect to + primary. Since this failure can happen during switching to + backup (backup might have not noticed the primary going down yet), + we will wait a while and keep sending START_USE to backup. + Only after that we'll give up. */ + if (server->router == sock->user_data && + (time(0) - server->router_connect) < 30) { + SILC_LOG_DEBUG(("Resending START_USE to backup router")); + silc_server_backup_send_start_use(server, sock, FALSE); + break; + } + /* If backup is our primary, disconnect now. */ if (server->router == sock->user_data) { if (sock->user_data) @@ -5350,6 +5369,9 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_rekey_callback) SilcProtocol protocol; SilcServerRekeyInternalContext *proto_ctx; + if (!idata) + return; + /* Do not execute rekey with disabled connections, as it would not go through anyway. 
*/ if (idata->status & SILC_IDLIST_STATUS_DISABLED) diff --git a/apps/silcd/server_backup.c b/apps/silcd/server_backup.c index 876eba20..df9d1b7e 100644 --- a/apps/silcd/server_backup.c +++ b/apps/silcd/server_backup.c @@ -1081,15 +1081,10 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_protocol_backup) ctx->sessions_count++; } - /* If we are not standalone and our primary is not the one we're - talking to now, then announce our information to it since we - haven't done that yet. Standalone backup router announces - these during connecting to the primary. */ - if (!server->standalone && SILC_PRIMARY_ROUTE(server) != ctx->sock) { - silc_server_announce_servers(server, TRUE, 0, ctx->sock); - silc_server_announce_clients(server, 0, ctx->sock); - silc_server_announce_channels(server, 0, ctx->sock); - } + /* Announce data to the new primary to be. */ + silc_server_announce_servers(server, TRUE, 0, ctx->sock); + silc_server_announce_clients(server, 0, ctx->sock); + silc_server_announce_channels(server, 0, ctx->sock); protocol->state++; @@ -1313,7 +1308,9 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_protocol_backup) silc_server_local_servers_toggle_enabled(server, FALSE); router->data.status &= ~SILC_IDLIST_STATUS_DISABLED; silc_server_update_servers_by_server(server, backup_router, router); - silc_server_update_clients_by_server(server, NULL, router, FALSE); + silc_server_update_clients_by_server( + server, NULL, router, + server->server_type == SILC_BACKUP_ROUTER); if (server->server_type == SILC_SERVER) silc_server_update_channels_by_server(server, backup_router, router); silc_server_backup_replaced_del(server, backup_router); @@ -1436,14 +1433,15 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) to perfom resuming protocol. 
*/ server->server_type = SILC_BACKUP_ROUTER; silc_server_local_servers_toggle_enabled(server, FALSE); + server_entry->data.status &= ~SILC_IDLIST_STATUS_DISABLED; silc_server_update_servers_by_server(server, server->id_entry, sock->user_data); silc_server_update_clients_by_server(server, NULL, - sock->user_data, FALSE); + sock->user_data, TRUE); /* Announce our clients and channels to the router */ - silc_server_announce_clients(server, ctx->start, sock); - silc_server_announce_channels(server, ctx->start, sock); + silc_server_announce_clients(server, 0, sock); + silc_server_announce_channels(server, 0, sock); } continue; @@ -1460,14 +1458,12 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) if (ctx->type == SILC_SERVER_BACKUP_RESUMED && server->router) { /* Announce all of our information to the router. */ if (server->server_type == SILC_ROUTER) - silc_server_announce_servers(server, FALSE, ctx->start, + silc_server_announce_servers(server, FALSE, 0, server->router->connection); /* Announce our clients and channels to the router */ - silc_server_announce_clients(server, ctx->start, - server->router->connection); - silc_server_announce_channels(server, ctx->start, - server->router->connection); + silc_server_announce_clients(server, 0, server->router->connection); + silc_server_announce_channels(server, 0, server->router->connection); } } else { /* Error */ diff --git a/apps/silcd/server_util.c b/apps/silcd/server_util.c index 8405f7e9..2cc49078 100644 --- a/apps/silcd/server_util.c +++ b/apps/silcd/server_util.c @@ -386,6 +386,8 @@ silc_server_update_clients_by_real_server(SilcServer server, SilcIDCacheList list; bool tolocal = (to == server->id_entry); + SILC_LOG_DEBUG(("Start")); + if (!silc_idcache_get_all(server->local_list->servers, &list)) return NULL; @@ -412,6 +414,7 @@ silc_server_update_clients_by_real_server(SilcServer server, } server_entry = server_entry->router; } else { + SILC_LOG_DEBUG(("Server locally connected")); /* If the client is not 
marked as local then move it to local list since the server is local. */ if (server_entry->server_type != SILC_BACKUP_ROUTER && !local) { @@ -420,6 +423,16 @@ silc_server_update_clients_by_real_server(SilcServer server, client_cache->id, client_cache->context, client_cache->expire, NULL); silc_idcache_del_by_context(server->global_list->clients, client); + + } else if (server->server_type == SILC_BACKUP_ROUTER && local) { + /* If we are backup router and this client is on local list, we + must move it to global list, as it is not currently local to + us (we are not primary). */ + SILC_LOG_DEBUG(("Moving client to global list")); + silc_idcache_add(server->global_list->clients, client_cache->name, + client_cache->id, client_cache->context, + client_cache->expire, NULL); + silc_idcache_del_by_context(server->local_list->clients, client); } } @@ -459,6 +472,7 @@ silc_server_update_clients_by_real_server(SilcServer server, } server_entry = server_entry->router; } else { + SILC_LOG_DEBUG(("Server locally connected")); /* If the client is marked as local then move it to global list since the server is global. */ if (server_entry->server_type != SILC_BACKUP_ROUTER && local) { @@ -500,6 +514,16 @@ void silc_server_update_clients_by_server(SilcServer server, SilcClientEntry client = NULL; bool local; + if (from && from->id) { + SILC_LOG_DEBUG(("Changing from server %s", + silc_id_render(from->id, SILC_ID_SERVER))); + } + if (to && to->id) { + SILC_LOG_DEBUG(("Changing to server %s", + silc_id_render(to->id, SILC_ID_SERVER))); + } + + SILC_LOG_DEBUG(("global list")); local = FALSE; if (silc_idcache_get_all(server->global_list->clients, &list)) { if (silc_idcache_list_first(list, &id_cache)) { @@ -541,6 +565,13 @@ void silc_server_update_clients_by_server(SilcServer server, } } else { /* All are changed */ + if (resolve_real_server) + /* Call this so that the entry is moved to correct list if + needed. No resolving by real server is actually done. 
*/ + silc_server_update_clients_by_real_server(server, NULL, to, + client, local, + id_cache); + client->router = to; } @@ -555,6 +586,7 @@ void silc_server_update_clients_by_server(SilcServer server, silc_idcache_list_free(list); } + SILC_LOG_DEBUG(("local list")); local = TRUE; if (silc_idcache_get_all(server->local_list->clients, &list)) { if (silc_idcache_list_first(list, &id_cache)) { @@ -592,6 +624,13 @@ void silc_server_update_clients_by_server(SilcServer server, } } else { /* All are changed */ + if (resolve_real_server) + /* Call this so that the entry is moved to correct list if + needed. No resolving by real server is actually done. */ + silc_server_update_clients_by_real_server(server, NULL, to, + client, local, + id_cache); + client->router = to; } diff --git a/apps/silcd/server_util.h b/apps/silcd/server_util.h index 32ea609f..413bdeb9 100644 --- a/apps/silcd/server_util.h +++ b/apps/silcd/server_util.h @@ -34,7 +34,8 @@ bool silc_server_remove_clients_by_server(SilcServer server, attempt to figure out which clients really are originated from the `from' and which are originated from a server that we have connection to, when we've acting as backup router. If it is FALSE the `to' will - be the new source. */ + be the new source. If `from' is NULL then all clients (except locally + connected) are updated `to'. */ void silc_server_update_clients_by_server(SilcServer server, SilcServerEntry from, SilcServerEntry to,