+Mon Oct 13 21:37:47 EEST 2003 Pekka Riikonen <priikone@silcnet.org>
+
+ * Continued backup router tests and fixes. Affected files
+ silcd/server_backup, server_util.c, server.c. See TODO.
+
Sun Oct 12 19:58:18 EEST 2003 Pekka Riikonen <priikone@silcnet.org>
* Fixed SERVER_SIGNOFF handling in servers. The client
o Backup router testing
- - test all resume error cases for backup router
- - test all resume error cases for normal server
- - test all resume error cases for primary router
+ - Switching tests
+ - (1) primary goes down (works)
+ - (2) server(s) loses primary, but backup doesn't
+ - Works, if the primary sends SERVER_SIGNOFF to backup for the
+ signoffed servers, it will disconnect them. The servers
+ will reconnect to primary, no desync. If primary sends ping
+ back (before SERVER_SIGNOFF) the backup returns failure to
+ server. Server resends START_USE, and same repeats. Either
+ the SERVER_SIGNOFF is received or the server disconnects from
+ backup, and reconnects to primary.
+ - (3) server loses primary, but backup doesn't, but ping times out
+ (no crash in primary)
+ - Works, the backup will be primary, server will notice it and
+ network works. When the backup gets connection back to primary,
+ primary will reject resuming. Backup switches back to backup
+ router. Server times out, disconnects and reconnects to primary
+ to avoid desync.
+ - (4) backup loses primary, but server(s) don't
+ - Works, the backup is in desync. When backup connects back to
+ primary it attempts to execute the resuming. The primary will
+ reject this. The backup accepts it and resumes as backup
+ router, no desync. Servers get the backup resuming protocol
+ but it will time out, and no other action is taken. No desync in
+ servers.
+ - (5) server loses primary, backup crashes
+ - Works, the server will attempt to reconnect to the primary
+ and backup. Server will be cut from rest of the network.
+ - (6) server loses primary, backup doesn't, rejects server's use,
+ then backup loses primary
+ - Same as (3), if primary crashed, normal resuming occurs.
+
+ - Resuming tests
+ - (1) normal resuming (works)
+ - (2) backup crashes during resuming
+ - Works, no desync in router or server. Server reconnects to
+ the router to avoid desync in server.
+ - (3) primary crashes during resuming
+ - Works, no desync in backup or server. Backup handles crash
+ during first contact, during resuming and immediately after.
+ Server handles crash during first contact, after contact and
+ immediately after. In case of error server falls back to the
+ backup router. If backup rejects fallback, server disconnects
+ and reconnects to both backup and primary.
+ - (4) a server crashes during resuming (multiple servers present)
+ - Works, no desync in backup or servers. The backup restarts the
+ protocol after timeout. After that the protocol executes
+ quickly since all other servers are already connected to the
+ primary. While waiting for the restart, servers fall back to the
+ backup.
+ - (5) server can connect to primary but cannot communicate
+ - Works, no desync in server. The server notices that the
+ protocol did not succeed, and verifies from backup whether it
+ can be used still. If backup refuses the server reconnects
+ to the primary router.
+ - (6) backup can connect to primary but cannot communicate
+ - Same as (9).
+ - (7) primary won't communicate with server
+ - Same as (5).
+ - (8) primary won't communicate with backup
+ - Same as (6), (9).
+ - (9) backup cannot communicate with server
+ - Works, no desync in backup or servers. The backup restarts the
+ protocol after timeout. This repeats until the protocol
+ is executed successfully. Primary can communicate through the
+ backup during this. Servers fall back to backup after timeout
+ occurs and wait for new resuming. If the server never answers
+ anything to backup (but is up) then resuming lasts forever,
+ until the server is either removed from network or starts
+ communicating. However, this does not cause network desync.
+ - (10) server cannot communicate with backup
+ - Same as (5), (9).
- Notifys (works)
JOIN, TOPIC_SET, CMODE_CHANGE, CUMODE_CHANGE, CHANNEL_CHANGE,
server->standalone = FALSE;
server->backup_primary = FALSE;
- /* If we are router then announce our possible servers. Backup
- router announces also global servers. */
- if (server->server_type == SILC_ROUTER)
- silc_server_announce_servers(server,
- server->backup_router ? TRUE : FALSE,
- 0, SILC_PRIMARY_ROUTE(server));
+ /* Announce data now, unless we are a backup router whose server
+ type is currently SILC_ROUTER. In that case the data is announced
+ later, at the end of the resuming protocol. */
+ if (server->backup_router && server->server_type == SILC_ROUTER) {
+ SILC_LOG_DEBUG(("Announce data after resume protocol"));
+ } else {
+ /* If we are router then announce our possible servers. Backup
+ router announces also global servers. */
+ if (server->server_type == SILC_ROUTER)
+ silc_server_announce_servers(server,
+ server->backup_router ? TRUE : FALSE,
+ 0, SILC_PRIMARY_ROUTE(server));
- /* Announce our clients and channels to the router */
- silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server));
- silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server));
+ /* Announce our clients and channels to the router */
+ silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server));
+ silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server));
+ }
/* If we are backup router then this primary router is whom we are
backing up. */
/* Attempt to reconnect to primary */
SILC_LOG_DEBUG(("Received failed START_USE from backup %s", sock->ip));
+ /* Default action is to disconnect from backup and reconnect to
+ primary. Since this failure can happen during switching to
+ backup (backup might not have noticed the primary going down yet),
+ we will wait a while and keep sending START_USE to backup.
+ Only after that we'll give up. */
+ if (server->router == sock->user_data &&
+ (time(0) - server->router_connect) < 30) {
+ SILC_LOG_DEBUG(("Resending START_USE to backup router"));
+ silc_server_backup_send_start_use(server, sock, FALSE);
+ break;
+ }
+
/* If backup is our primary, disconnect now. */
if (server->router == sock->user_data) {
if (sock->user_data)
SilcProtocol protocol;
SilcServerRekeyInternalContext *proto_ctx;
+ if (!idata)
+ return;
+
/* Do not execute rekey with disabled connections, as it would not
go through anyway. */
if (idata->status & SILC_IDLIST_STATUS_DISABLED)
ctx->sessions_count++;
}
- /* If we are not standalone and our primary is not the one we're
- talking to now, then announce our information to it since we
- haven't done that yet. Standalone backup router announces
- these during connecting to the primary. */
- if (!server->standalone && SILC_PRIMARY_ROUTE(server) != ctx->sock) {
- silc_server_announce_servers(server, TRUE, 0, ctx->sock);
- silc_server_announce_clients(server, 0, ctx->sock);
- silc_server_announce_channels(server, 0, ctx->sock);
- }
+ /* Announce data to the new primary to be. */
+ silc_server_announce_servers(server, TRUE, 0, ctx->sock);
+ silc_server_announce_clients(server, 0, ctx->sock);
+ silc_server_announce_channels(server, 0, ctx->sock);
protocol->state++;
silc_server_local_servers_toggle_enabled(server, FALSE);
router->data.status &= ~SILC_IDLIST_STATUS_DISABLED;
silc_server_update_servers_by_server(server, backup_router, router);
- silc_server_update_clients_by_server(server, NULL, router, FALSE);
+ silc_server_update_clients_by_server(
+ server, NULL, router,
+ server->server_type == SILC_BACKUP_ROUTER);
if (server->server_type == SILC_SERVER)
silc_server_update_channels_by_server(server, backup_router, router);
silc_server_backup_replaced_del(server, backup_router);
to perfom resuming protocol. */
server->server_type = SILC_BACKUP_ROUTER;
silc_server_local_servers_toggle_enabled(server, FALSE);
+ server_entry->data.status &= ~SILC_IDLIST_STATUS_DISABLED;
silc_server_update_servers_by_server(server, server->id_entry,
sock->user_data);
silc_server_update_clients_by_server(server, NULL,
- sock->user_data, FALSE);
+ sock->user_data, TRUE);
/* Announce our clients and channels to the router */
- silc_server_announce_clients(server, ctx->start, sock);
- silc_server_announce_channels(server, ctx->start, sock);
+ silc_server_announce_clients(server, 0, sock);
+ silc_server_announce_channels(server, 0, sock);
}
continue;
if (ctx->type == SILC_SERVER_BACKUP_RESUMED && server->router) {
/* Announce all of our information to the router. */
if (server->server_type == SILC_ROUTER)
- silc_server_announce_servers(server, FALSE, ctx->start,
+ silc_server_announce_servers(server, FALSE, 0,
server->router->connection);
/* Announce our clients and channels to the router */
- silc_server_announce_clients(server, ctx->start,
- server->router->connection);
- silc_server_announce_channels(server, ctx->start,
- server->router->connection);
+ silc_server_announce_clients(server, 0, server->router->connection);
+ silc_server_announce_channels(server, 0, server->router->connection);
}
} else {
/* Error */
SilcIDCacheList list;
bool tolocal = (to == server->id_entry);
+ SILC_LOG_DEBUG(("Start"));
+
if (!silc_idcache_get_all(server->local_list->servers, &list))
return NULL;
}
server_entry = server_entry->router;
} else {
+ SILC_LOG_DEBUG(("Server locally connected"));
/* If the client is not marked as local then move it to local list
since the server is local. */
if (server_entry->server_type != SILC_BACKUP_ROUTER && !local) {
client_cache->id, client_cache->context,
client_cache->expire, NULL);
silc_idcache_del_by_context(server->global_list->clients, client);
+
+ } else if (server->server_type == SILC_BACKUP_ROUTER && local) {
+ /* If we are backup router and this client is on local list, we
+ must move it to global list, as it is not currently local to
+ us (we are not primary). */
+ SILC_LOG_DEBUG(("Moving client to global list"));
+ silc_idcache_add(server->global_list->clients, client_cache->name,
+ client_cache->id, client_cache->context,
+ client_cache->expire, NULL);
+ silc_idcache_del_by_context(server->local_list->clients, client);
}
}
}
server_entry = server_entry->router;
} else {
+ SILC_LOG_DEBUG(("Server locally connected"));
/* If the client is marked as local then move it to global list
since the server is global. */
if (server_entry->server_type != SILC_BACKUP_ROUTER && local) {
SilcClientEntry client = NULL;
bool local;
+ if (from && from->id) {
+ SILC_LOG_DEBUG(("Changing from server %s",
+ silc_id_render(from->id, SILC_ID_SERVER)));
+ }
+ if (to && to->id) {
+ SILC_LOG_DEBUG(("Changing to server %s",
+ silc_id_render(to->id, SILC_ID_SERVER)));
+ }
+
+ SILC_LOG_DEBUG(("global list"));
local = FALSE;
if (silc_idcache_get_all(server->global_list->clients, &list)) {
if (silc_idcache_list_first(list, &id_cache)) {
}
} else {
/* All are changed */
+ if (resolve_real_server)
+ /* Call this so that the entry is moved to correct list if
+ needed. No resolving by real server is actually done. */
+ silc_server_update_clients_by_real_server(server, NULL, to,
+ client, local,
+ id_cache);
+
client->router = to;
}
silc_idcache_list_free(list);
}
+ SILC_LOG_DEBUG(("local list"));
local = TRUE;
if (silc_idcache_get_all(server->local_list->clients, &list)) {
if (silc_idcache_list_first(list, &id_cache)) {
}
} else {
/* All are changed */
+ if (resolve_real_server)
+ /* Call this so that the entry is moved to correct list if
+ needed. No resolving by real server is actually done. */
+ silc_server_update_clients_by_real_server(server, NULL, to,
+ client, local,
+ id_cache);
+
client->router = to;
}
attempt to figure out which clients really are originated from the
`from' and which are originated from a server that we have connection
to, when we've acting as backup router. If it is FALSE the `to' will
- be the new source. */
+ be the new source. If `from' is NULL then all clients (except locally
+ connected) are updated `to'. */
void silc_server_update_clients_by_server(SilcServer server,
SilcServerEntry from,
SilcServerEntry to,