From: Pekka Riikonen Date: Fri, 10 Oct 2003 17:09:35 +0000 (+0000) Subject: Fixed desync bugs during backup resuming protocol in backup X-Git-Tag: silc.client.0.9.13~9 X-Git-Url: http://git.silcnet.org/gitweb/?a=commitdiff_plain;h=656b89cbd7bc3b8252369b13b4c019c37ba4861c;p=silc.git Fixed desync bugs during backup resuming protocol in backup and normal server. Assure that only one protocol is executing at a time. --- diff --git a/CHANGES b/CHANGES index ab922b39..8288fb9f 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,17 @@ +Fri Oct 10 16:27:12 EEST 2003 Pekka Riikonen + + * On normal server reconnect to primary during resuming 4 + times, then give up. Affected file silcd/server_backup.c. + + * If during reconnecting to routers we notice we have router + connection but no primary router set, the server is in desync. + Reconnect to primary to restore network. Affected file + silcd/server.c. + + * Assure that only one protocol is executing at the same time. + Added checks for all protocols. Affected files are + silcd/server.c and server_backup.c. + Thu Oct 9 20:24:09 EEST 2003 Pekka Riikonen * Check that a string is not already part on invite/ban diff --git a/apps/silcd/packet_receive.c b/apps/silcd/packet_receive.c index 3fedae33..1e60853d 100644 --- a/apps/silcd/packet_receive.c +++ b/apps/silcd/packet_receive.c @@ -949,8 +949,8 @@ void silc_server_notify(SilcServer server, } /* Check whether to give founder rights to this user or not. The - problem here is that we get only the public key of the client, - but no authentication data. We must assume that server has + problem here is that we get only the public key of the client, + but no authentication data. We must assume that server has already authenticated the user (and thus we must trust the server). 
*/ if (mode & SILC_CHANNEL_UMODE_CHANFO && @@ -962,7 +962,7 @@ void silc_server_notify(SilcServer server, /* If channel doesn't have founder auth mode then it's impossible that someone would be getting founder rights with CUMODE command. In that case there already either is founder or there isn't - founder at all on the channel (valid only when 'client' is + founder at all on the channel (valid only when 'client' is valid). */ if (client && !(channel->mode & SILC_CHANNEL_MODE_FOUNDER_AUTH)) { /* Force the mode to not have founder mode */ @@ -3400,7 +3400,16 @@ void silc_server_rekey(SilcServer server, SilcServerRekeyInternalContext *proto_ctx; SilcIDListData idata = (SilcIDListData)sock->user_data; - SILC_LOG_DEBUG(("Start")); + SILC_LOG_DEBUG(("Received rekey request")); + + /* If we have other protocol executing we have no other choice but to + not execute rekey. XXX This is very bad thing. Let's hope this + doesn't happen often. */ + if (sock->protocol) { + SILC_LOG_WARNING(("Cannot execute REKEY protocol because other protocol " + "is executing at the same time")); + return; + } /* Allocate internal protocol context. This is sent as context to the protocol. 
*/ diff --git a/apps/silcd/server.c b/apps/silcd/server.c index 6b1b7dc7..d9e756d8 100644 --- a/apps/silcd/server.c +++ b/apps/silcd/server.c @@ -784,10 +784,12 @@ void silc_server_stop(SilcServer server) silc_server_disconnect_remote(server, server->sockets[i], SILC_STATUS_OK, "Server is shutting down"); - if (sock->user_data) - silc_server_free_sock_user_data(server, sock, - "Server is shutting down"); - silc_socket_free(sock); + if (server->sockets[i]) { + if (sock->user_data) + silc_server_free_sock_user_data(server, sock, + "Server is shutting down"); + silc_socket_free(sock); + } } else { silc_socket_free(server->sockets[i]); server->sockets[i] = NULL; @@ -1064,7 +1066,32 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_connect_to_router) NULL : ptr->host, ptr->port, SILC_SOCKET_TYPE_ROUTER)) { SILC_LOG_DEBUG(("We are already connected to this router")); - continue; + + /* If we don't have primary router and this connection is our + primary router we are in desync. Reconnect to the primary. */ + if (server->standalone && !server->router) { + SilcServerConfigRouter *primary = + silc_server_config_get_primary_router(server); + if (primary == ptr) { + SilcSocketConnection sock = + silc_server_find_socket_by_host(server, SILC_SOCKET_TYPE_ROUTER, + ptr->host, ptr->port); + if (sock) { + server->backup_noswitch = TRUE; + if (sock->user_data) + silc_server_free_sock_user_data(server, sock, NULL); + silc_server_disconnect_remote(server, sock, 0, NULL); + server->backup_noswitch = FALSE; + SILC_LOG_DEBUG(("Reconnecting to primary router")); + } else { + continue; + } + } else { + continue; + } + } else { + continue; + } } if (silc_server_num_sockets_by_remote(server, silc_net_is_ip(ptr->host) ? 
@@ -2167,7 +2194,8 @@ SILC_TASK_CALLBACK(silc_server_accept_new_connection_final) server->schedule); out: - silc_protocol_free(protocol); + if (sock->protocol == protocol) + silc_protocol_free(protocol); if (ctx->packet) silc_packet_context_free(ctx->packet); if (ctx->ske) @@ -2595,11 +2623,6 @@ void silc_server_packet_parse_type(SilcServer server, */ if (packet->flags & SILC_PACKET_FLAG_LIST) break; - if (sock->protocol) { - sock->protocol->state = SILC_PROTOCOL_STATE_FAILURE; - silc_protocol_execute(sock->protocol, server->schedule, 0, 0); - break; - } /* Check for failure START_USE from backup router */ if (server->server_type == SILC_SERVER && @@ -2609,9 +2632,26 @@ void silc_server_packet_parse_type(SilcServer server, if (type == SILC_SERVER_BACKUP_START_USE) { /* Attempt to reconnect to primary */ SILC_LOG_DEBUG(("Received failed START_USE from backup %s", sock->ip)); + + /* If backup is our primary, disconnect now. */ + if (server->router == sock->user_data) { + if (sock->user_data) + silc_server_free_sock_user_data(server, sock, NULL); + SILC_SET_DISCONNECTING(sock); + silc_server_close_connection(server, sock); + } + + /* Reconnect */ silc_server_create_connections(server); } } + + /* Execute protocol */ + if (sock->protocol) { + sock->protocol->state = SILC_PROTOCOL_STATE_FAILURE; + silc_protocol_execute(sock->protocol, server->schedule, 0, 0); + break; + } break; case SILC_PACKET_REJECT: @@ -2723,7 +2763,12 @@ void silc_server_packet_parse_type(SilcServer server, silc_protocol_execute(sock->protocol, server->schedule, 0, 100000); } else { SILC_LOG_ERROR(("Received Key Exchange packet but no key exchange " - "protocol active, packet dropped.")); + "protocol active (%s:%d [%s]).", sock->hostname, + sock->port, + (sock->type == SILC_SOCKET_TYPE_UNKNOWN ? "Unknown" : + sock->type == SILC_SOCKET_TYPE_CLIENT ? "Client" : + sock->type == SILC_SOCKET_TYPE_SERVER ? 
"Server" : + "Router"))); } break; @@ -2766,7 +2811,12 @@ void silc_server_packet_parse_type(SilcServer server, } } else { SILC_LOG_ERROR(("Received Key Exchange 1 packet but no key exchange " - "protocol active, packet dropped.")); + "protocol active (%s:%d [%s]).", sock->hostname, + sock->port, + (sock->type == SILC_SOCKET_TYPE_UNKNOWN ? "Unknown" : + sock->type == SILC_SOCKET_TYPE_CLIENT ? "Client" : + sock->type == SILC_SOCKET_TYPE_SERVER ? "Server" : + "Router"))); } break; @@ -2809,7 +2859,12 @@ void silc_server_packet_parse_type(SilcServer server, } } else { SILC_LOG_ERROR(("Received Key Exchange 2 packet but no key exchange " - "protocol active, packet dropped.")); + "protocol active (%s:%d [%s]).", sock->hostname, + sock->port, + (sock->type == SILC_SOCKET_TYPE_UNKNOWN ? "Unknown" : + sock->type == SILC_SOCKET_TYPE_CLIENT ? "Client" : + sock->type == SILC_SOCKET_TYPE_SERVER ? "Server" : + "Router"))); } break; @@ -2846,7 +2901,12 @@ void silc_server_packet_parse_type(SilcServer server, silc_protocol_execute(sock->protocol, server->schedule, 0, 0); } else { SILC_LOG_ERROR(("Received Connection Auth packet but no authentication " - "protocol active, packet dropped.")); + "protocol active (%s:%d [%s]).", sock->hostname, + sock->port, + (sock->type == SILC_SOCKET_TYPE_UNKNOWN ? "Unknown" : + sock->type == SILC_SOCKET_TYPE_CLIENT ? "Client" : + sock->type == SILC_SOCKET_TYPE_SERVER ? "Server" : + "Router"))); } break; @@ -2945,7 +3005,12 @@ void silc_server_packet_parse_type(SilcServer server, silc_protocol_execute(sock->protocol, server->schedule, 0, 0); } else { SILC_LOG_ERROR(("Received Re-key done packet but no re-key " - "protocol active, packet dropped.")); + "protocol active (%s:%d [%s]).", sock->hostname, + sock->port, + (sock->type == SILC_SOCKET_TYPE_UNKNOWN ? "Unknown" : + sock->type == SILC_SOCKET_TYPE_CLIENT ? "Client" : + sock->type == SILC_SOCKET_TYPE_SERVER ? 
"Server" : + "Router"))); } break; @@ -3281,7 +3346,7 @@ void silc_server_free_sock_user_data(SilcServer server, /* We stop here to take a breath */ sleep(2); - if (server->server_type == SILC_BACKUP_ROUTER) { + if (server->backup_router) { server->server_type = SILC_ROUTER; /* We'll need to constantly try to reconnect to the primary @@ -4379,7 +4444,7 @@ void silc_server_announce_get_inviteban(SilcServer server, silc_buffer_free(idp2); silc_buffer_free(list); } - + /* Encode ban list */ if (channel->ban_list && silc_hash_table_count(channel->ban_list)) { list = silc_buffer_alloc_size(2); @@ -5250,6 +5315,18 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_rekey_callback) sock->protocol->protocol->type == SILC_PROTOCOL_SERVER_REKEY) return; + /* If any other protocol is active do not start this protocol yet. */ + if (sock->protocol) { + SILC_LOG_DEBUG(("Waiting for other protocol to finish before rekeying")); + silc_schedule_task_add(server->schedule, sock->sock, + silc_server_rekey_callback, + sock, 60, 0, SILC_TASK_TIMEOUT, + SILC_TASK_PRI_NORMAL); + return; + } + + SILC_LOG_DEBUG(("Executing rekey protocol")); + /* Allocate internal protocol context. This is sent as context to the protocol. 
*/ proto_ctx = silc_calloc(1, sizeof(*proto_ctx)); diff --git a/apps/silcd/server_backup.c b/apps/silcd/server_backup.c index 72becccc..7bf72544 100644 --- a/apps/silcd/server_backup.c +++ b/apps/silcd/server_backup.c @@ -489,6 +489,8 @@ void silc_server_backup_send_replaced(SilcServer server, /************************ Backup Resuming Protocol **************************/ +/* Timeout callback for protocol */ + SILC_TASK_CALLBACK(silc_server_backup_timeout) { SilcProtocol protocol = context; @@ -500,6 +502,34 @@ SILC_TASK_CALLBACK(silc_server_backup_timeout) silc_protocol_execute_final(protocol, server->schedule); } +/* Callback to start the protocol as responder */ + +SILC_TASK_CALLBACK(silc_server_backup_responder_start) +{ + SilcServerBackupProtocolContext proto_ctx = context; + SilcSocketConnection sock = proto_ctx->sock; + SilcServer server = app_context; + + /* If other protocol is executing at the same time, start with timeout. */ + if (sock->protocol) { + silc_schedule_task_add(server->schedule, sock->sock, + silc_server_backup_responder_start, + proto_ctx, 2, 0, + SILC_TASK_TIMEOUT, SILC_TASK_PRI_NORMAL); + return; + } + + /* Run the backup resuming protocol */ + silc_protocol_alloc(SILC_PROTOCOL_SERVER_BACKUP, + &sock->protocol, proto_ctx, + silc_server_protocol_backup_done); + silc_protocol_execute(sock->protocol, server->schedule, 0, 0); + silc_schedule_task_add(server->schedule, sock->sock, + silc_server_backup_timeout, + sock->protocol, 30, 0, SILC_TASK_TIMEOUT, + SILC_TASK_PRI_NORMAL); +} + typedef struct { SilcServer server; SilcSocketConnection sock; @@ -646,7 +676,7 @@ void silc_server_backup_resume_router(SilcServer server, proto_ctx = silc_calloc(1, sizeof(*proto_ctx)); proto_ctx->server = server; - proto_ctx->sock = sock; + proto_ctx->sock = silc_socket_dup(sock); proto_ctx->responder = TRUE; proto_ctx->type = type; proto_ctx->session = session; @@ -655,15 +685,11 @@ void silc_server_backup_resume_router(SilcServer server, 
SILC_LOG_DEBUG(("Starting backup resuming protocol as responder")); SILC_LOG_INFO(("Starting backup resuming protocol")); - /* Run the backup resuming protocol */ - silc_protocol_alloc(SILC_PROTOCOL_SERVER_BACKUP, - &sock->protocol, proto_ctx, - silc_server_protocol_backup_done); - silc_protocol_execute(sock->protocol, server->schedule, 0, 0); + /* Start protocol immediately */ silc_schedule_task_add(server->schedule, sock->sock, - silc_server_backup_timeout, - sock->protocol, 30, 0, SILC_TASK_TIMEOUT, - SILC_TASK_PRI_NORMAL); + silc_server_backup_responder_start, + proto_ctx, 0, 1, + SILC_TASK_TIMEOUT, SILC_TASK_PRI_NORMAL); return; } @@ -699,7 +725,9 @@ void silc_server_backup_resume_router(SilcServer server, bsock->protocol->protocol->type == SILC_PROTOCOL_SERVER_BACKUP) { sock->protocol = bsock->protocol; ctx = sock->protocol->context; - ctx->sock = sock; + if (ctx->sock) + silc_socket_free(ctx->sock); /* unref */ + ctx->sock = silc_socket_dup(sock); } } } @@ -747,6 +775,14 @@ SILC_TASK_CALLBACK(silc_server_backup_connect_to_router) sock = silc_net_create_connection(server_ip, sconn->remote_port, sconn->remote_host); if (sock < 0) { + if (server->server_type == SILC_SERVER) { + sconn->retry_count++; + if (sconn->retry_count > 3) { + silc_free(sconn->remote_host); + silc_free(sconn); + return; + } + } silc_schedule_task_add(server->schedule, 0, silc_server_backup_connect_to_router, context, 10, 0, SILC_TASK_TIMEOUT, @@ -777,6 +813,7 @@ void silc_server_backup_reconnect(SilcServer server, sconn->callback = callback; sconn->callback_context = context; sconn->no_reconnect = TRUE; + sconn->retry_count = 0; silc_schedule_task_add(server->schedule, 0, silc_server_backup_connect_to_router, sconn, 1, 0, SILC_TASK_TIMEOUT, @@ -793,6 +830,16 @@ SILC_TASK_CALLBACK(silc_server_backup_connected_later) SilcServer server = proto_ctx->server; SilcSocketConnection sock = proto_ctx->sock; + /* If running other protocol already run this one a bit later. 
*/ + if (sock->protocol) { + silc_schedule_task_add(server->schedule, 0, + silc_server_backup_connected_later, + proto_ctx, 10, 0, + SILC_TASK_TIMEOUT, + SILC_TASK_PRI_NORMAL); + return; + } + SILC_LOG_DEBUG(("Starting backup resuming protocol as initiator")); SILC_LOG_INFO(("Starting backup resuming protocol")); @@ -823,18 +870,21 @@ void silc_server_backup_connected(SilcServer server, /* Try again */ SilcServerConfigRouter *primary; primary = silc_server_config_get_primary_router(server); - if (primary) - silc_server_backup_reconnect(server, - primary->host, primary->port, - silc_server_backup_connected, - context); + if (primary) { + if (!silc_server_find_socket_by_host(server, SILC_SOCKET_TYPE_ROUTER, + primary->host, primary->port)) + silc_server_backup_reconnect(server, + primary->host, primary->port, + silc_server_backup_connected, + context); + } return; } sock = (SilcSocketConnection)server_entry->connection; proto_ctx = silc_calloc(1, sizeof(*proto_ctx)); proto_ctx->server = server; - proto_ctx->sock = sock; + proto_ctx->sock = silc_socket_dup(sock); proto_ctx->responder = FALSE; proto_ctx->type = SILC_SERVER_BACKUP_START; proto_ctx->start = time(0); @@ -862,18 +912,29 @@ static void silc_server_backup_connect_primary(SilcServer server, SilcIDListData idata; unsigned char data[2]; + if (SILC_IS_DISCONNECTING(backup_router) || + SILC_IS_DISCONNECTED(backup_router)) { + silc_socket_free(backup_router); + return; + } + if (!server_entry) { /* Try again */ SilcServerConfigRouter *primary; primary = silc_server_config_get_primary_router(server); if (primary) - silc_server_backup_reconnect(server, - primary->host, primary->port, - silc_server_backup_connect_primary, - context); + if (!silc_server_find_socket_by_host(server, SILC_SOCKET_TYPE_ROUTER, + primary->host, primary->port)) + silc_server_backup_reconnect(server, + primary->host, primary->port, + silc_server_backup_connect_primary, + context); return; } + /* Unref */ + silc_socket_free(backup_router); + 
if (!backup_router->protocol) return; if (!server_entry->connection) @@ -902,7 +963,9 @@ static void silc_server_backup_connect_primary(SilcServer server, packets in this protocol. We don't talk with backup router anymore. */ sock->protocol = backup_router->protocol; - ctx->sock = (SilcSocketConnection)server_entry->connection; + if (ctx->sock) + silc_socket_free(ctx->sock); /* unref */ + ctx->sock = silc_socket_dup(server_entry->connection); backup_router->protocol = NULL; } @@ -1037,7 +1100,7 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_protocol_backup) silc_server_backup_reconnect(server, primary->host, primary->port, silc_server_backup_connect_primary, - ctx->sock); + silc_socket_dup(ctx->sock)); } else { /* Nowhere to connect just return the CONNECTED packet */ SILC_LOG_DEBUG(("Received START (session %d), send CONNECTED back", @@ -1296,8 +1359,12 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) error = (protocol->state == SILC_PROTOCOL_STATE_ERROR || protocol->state == SILC_PROTOCOL_STATE_FAILURE); - if (error) + if (error) { SILC_LOG_ERROR(("Error occurred during backup router resuming protcool")); + if (server->server_type == SILC_SERVER) + silc_schedule_task_del_by_callback(server->schedule, + silc_server_backup_connect_to_router); + } if (server->server_shutdown) return; @@ -1318,16 +1385,22 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) sock->protocol = NULL; if (error) { - /* If we are normal server close router connections and reconnect. */ + /* If we are server close all router connections except backup, + send confirmation to backup that using it is still ok and continue + sending traffic there. The backup will reply with error if + it's not ok. 
*/ if (server->server_type == SILC_SERVER && - (server_entry->server_type == SILC_BACKUP_ROUTER || - server_entry->server_type == SILC_ROUTER)) { + server_entry->server_type == SILC_ROUTER) { server->backup_noswitch = TRUE; if (sock->user_data) silc_server_free_sock_user_data(server, sock, NULL); silc_server_disconnect_remote(server, sock, 0, NULL); server->backup_noswitch = FALSE; + /* Send START_USE just in case using backup wouldn't be ok. */ + silc_server_backup_send_start_use(server, server->router->connection, + FALSE); + silc_server_create_connections(server); continue; } @@ -1336,8 +1409,10 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) if (SILC_PRIMARY_ROUTE(server) == sock && server->backup_router) { server->backup_noswitch = TRUE; server->server_type = SILC_BACKUP_ROUTER; - if (ctx->sock == sock) + if (ctx->sock == sock) { + silc_socket_free(sock); /* unref */ ctx->sock = NULL; + } server->backup_noswitch = TRUE; if (sock->user_data) @@ -1359,6 +1434,8 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done) if (ctx->sock && ctx->sock->protocol) ctx->sock->protocol = NULL; + if (ctx->sock) + silc_socket_free(ctx->sock); /* unref */ silc_protocol_free(protocol); silc_free(ctx->sessions); silc_free(ctx); diff --git a/apps/silcd/server_util.c b/apps/silcd/server_util.c index 8e540867..1bc86f7c 100644 --- a/apps/silcd/server_util.c +++ b/apps/silcd/server_util.c @@ -1215,6 +1215,7 @@ bool silc_server_connection_allowed(SilcServer server, r_protocol_version < l_protocol_version) { SILC_LOG_INFO(("Connection %s (%s) is too old version", sock->hostname, sock->ip)); + sock->protocol = NULL; silc_server_disconnect_remote(server, sock, SILC_STATUS_ERR_BAD_VERSION, "You support too old protocol version"); @@ -1226,6 +1227,7 @@ bool silc_server_connection_allowed(SilcServer server, r_software_version < l_software_version) { SILC_LOG_INFO(("Connection %s (%s) is too old version", sock->hostname, sock->ip)); + sock->protocol = NULL; 
silc_server_disconnect_remote(server, sock, SILC_STATUS_ERR_BAD_VERSION, "You support too old software version"); @@ -1237,6 +1239,7 @@ bool silc_server_connection_allowed(SilcServer server, !silc_string_match(l_vendor_version, r_vendor_version)) { SILC_LOG_INFO(("Connection %s (%s) is unsupported version", sock->hostname, sock->ip)); + sock->protocol = NULL; silc_server_disconnect_remote(server, sock, SILC_STATUS_ERR_BAD_VERSION, "Your software is not supported"); @@ -1255,6 +1258,7 @@ bool silc_server_connection_allowed(SilcServer server, if (max_hosts && conn_number >= max_hosts) { SILC_LOG_INFO(("Server is full, closing %s (%s) connection", sock->hostname, sock->ip)); + sock->protocol = NULL; silc_server_disconnect_remote(server, sock, SILC_STATUS_ERR_RESOURCE_LIMIT, "Server is full, try again later"); @@ -1264,6 +1268,7 @@ bool silc_server_connection_allowed(SilcServer server, if (num_sockets >= max_per_host) { SILC_LOG_INFO(("Too many connections from %s (%s), closing connection", sock->hostname, sock->ip)); + sock->protocol = NULL; silc_server_disconnect_remote(server, sock, SILC_STATUS_ERR_RESOURCE_LIMIT, "Too many connections from your host");