Backup router testing, fixes.
authorPekka Riikonen <priikone@silcnet.org>
Mon, 13 Oct 2003 18:38:39 +0000 (18:38 +0000)
committerPekka Riikonen <priikone@silcnet.org>
Mon, 13 Oct 2003 18:38:39 +0000 (18:38 +0000)
CHANGES
TODO
apps/silcd/server.c
apps/silcd/server_backup.c
apps/silcd/server_util.c
apps/silcd/server_util.h

diff --git a/CHANGES b/CHANGES
index 26341b4e3e1acac5de54d2f1354a72ad9ef1cde7..5918b56f20d28f9e79b44ea33d7cb8cd9839bc6e 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,8 @@
+Mon Oct 13 21:37:47 EEST 2003  Pekka Riikonen <priikone@silcnet.org>
+
+       * Continued backup router tests and fixes.  Affected files
+         silcd/server_backup.c, server_util.c, server.c.  See TODO.
+
 Sun Oct 12 19:58:18 EEST 2003  Pekka Riikonen <priikone@silcnet.org>
 
        * Fixed SERVER_SIGNOFF handling in servers.  The client
diff --git a/TODO b/TODO
index 7c7fa70902d4a1d2679edc1a7127b4654895467f..1dbc95d23b9bf0533ec4f0641d633c0697b3fda0 100644 (file)
--- a/TODO
+++ b/TODO
@@ -11,9 +11,77 @@ TODO for SILC Server 1.0
 
  o Backup router testing
 
-   - test all resume error cases for backup router
-   - test all resume error cases for normal server
-   - test all resume error cases for primary router
+   - Switching tests
+     - (1) primary goes down (works)
+     - (2) server(s) lose primary, but backup doesn't
+       - Works, if the primary sends SERVER_SIGNOFF to backup for the
+         signed-off servers, the backup will disconnect them.  The servers
+         will reconnect to primary, no desync.  If primary sends ping
+         back (before SERVER_SIGNOFF) the backup returns failure to
+         server.  Server resends START_USE, and same repeats.  Either
+         the SERVER_SIGNOFF is received or the server disconnects from
+         backup, and reconnects to primary.
+     - (3) server loses primary, but backup doesn't, but ping times out
+          (no crash in primary)
+       - Works, the backup will be primary, server will notice it and
+         network works.  When the backup gets connection back to primary,
+         primary will reject resuming.  Backup switches back to backup
+         router.  Server times out, disconnects and reconnects to primary
+          to avoid desync.
+     - (4) backup loses primary, but server(s) don't
+       - Works, the backup is in desync.  When backup connects back to
+         primary it attempts to execute the resuming.  The primary will
+         reject this.  The backup accepts it and resumes as backup
+         router, no desync.  Servers get the backup resuming protocol
+          but it will time out, and no other action is taken.  No desync in
+          servers.
+     - (5) server loses primary, backup crashes
+       - Works, the server will attempt to reconnect to the primary
+         and backup.  Server will be cut from rest of the network.
+     - (6) server loses primary, backup doesn't, rejects server's use,
+           then backup loses primary
+       - Same as (3), if primary crashed, normal resuming occurs.
+
+   - Resuming tests
+     - (1) normal resuming (works)
+     - (2) backup crashes during resuming
+       - Works, no desync in router or server.  Server reconnects to
+         the router to avoid desync in server.
+     - (3) primary crashes during resuming
+       - Works, no desync in backup or server.  Backup handles crash
+         during first contact, during resuming and immediately after.
+         Server handles crash during first contact, after contact and
+         immediately after.  In case of error server falls back to the
+         backup router.  If backup rejects fallback, server disconnects
+         and reconnects to both backup and primary.
+     - (4) a server crashes during resuming (multiple servers present)
+       - Works, no desync in backup or servers.  The backup restarts the
+         protocol after timeout.  After that the protocol executes
+         quickly since all other servers are already connected to the
+         primary.  While waiting for the restart, servers fall back to
+         the backup.
+     - (5) server can connect to primary but cannot communicate
+       - Works, no desync in server.  The server notices that the 
+         protocol did not succeed, and verifies from backup whether it
+         can be used still.  If backup refuses the server reconnects
+         to the primary router.
+     - (6) backup can connect to primary but cannot communicate
+       - Same as (9).
+     - (7) primary won't communicate with server
+       - Same as (5).
+     - (8) primary won't communicate with backup
+       - Same as (6), (9).
+     - (9) backup cannot communicate with server
+       - Works, no desync in backup or servers.  The backup restarts the
+         protocol after timeout.  This repeats until the protocol
+         is executed successfully.  Primary can communicate through the
+         backup during this.  Servers fall back to backup after timeout
+         occurs and wait for new resuming.  If the server never answers
+         anything to backup (but is up) then resuming lasts forever,
+         until the server is either removed from network or starts
+         communicating.  However, this does not cause network desync.
+     - (10) server cannot communicate with backup
+       - Same as (5), (9).
 
    - Notifys (works)
    JOIN, TOPIC_SET, CMODE_CHANGE, CUMODE_CHANGE, CHANNEL_CHANGE,
index 291b09227aed0f5d7c0f361a28e762dd6fcc423a..963746f9045acc439a214a8294b17724b529f0cd 100644 (file)
@@ -1443,16 +1443,23 @@ SILC_TASK_CALLBACK(silc_server_connect_to_router_final)
       server->standalone = FALSE;
       server->backup_primary = FALSE;
 
-      /* If we are router then announce our possible servers.  Backup
-        router announces also global servers. */
-      if (server->server_type == SILC_ROUTER)
-       silc_server_announce_servers(server,
-                                    server->backup_router ? TRUE : FALSE,
-                                    0, SILC_PRIMARY_ROUTE(server));
+      /* Announce data if we are not backup router (unless not as primary
+        currently).  Backup router announces later at the end of
+        resuming protocol. */
+      if (server->backup_router && server->server_type == SILC_ROUTER) {
+       SILC_LOG_DEBUG(("Announce data after resume protocol"));
+      } else {
+       /* If we are router then announce our possible servers.  Backup
+          router announces also global servers. */
+       if (server->server_type == SILC_ROUTER)
+         silc_server_announce_servers(server,
+                                      server->backup_router ? TRUE : FALSE,
+                                      0, SILC_PRIMARY_ROUTE(server));
 
-      /* Announce our clients and channels to the router */
-      silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server));
-      silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server));
+       /* Announce our clients and channels to the router */
+       silc_server_announce_clients(server, 0, SILC_PRIMARY_ROUTE(server));
+       silc_server_announce_channels(server, 0, SILC_PRIMARY_ROUTE(server));
+      }
 
       /* If we are backup router then this primary router is whom we are
         backing up. */
@@ -2634,6 +2641,18 @@ void silc_server_packet_parse_type(SilcServer server,
        /* Attempt to reconnect to primary */
        SILC_LOG_DEBUG(("Received failed START_USE from backup %s", sock->ip));
 
+       /* Default action is to disconnect from backup and reconnect to
+          primary.  Since this failure can happen during switching to
+          backup (backup might have not noticed the primary going down yet),
+          we will wait a while and keep sending START_USE to backup.
+          Only after that we'll give up. */
+       if (server->router == sock->user_data &&
+           (time(0) - server->router_connect) < 30) {
+         SILC_LOG_DEBUG(("Resending START_USE to backup router"));
+         silc_server_backup_send_start_use(server, sock, FALSE);
+         break;
+       }
+
        /* If backup is our primary, disconnect now. */
        if (server->router == sock->user_data) {
          if (sock->user_data)
@@ -5350,6 +5369,9 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_rekey_callback)
   SilcProtocol protocol;
   SilcServerRekeyInternalContext *proto_ctx;
 
+  if (!idata)
+    return;
+
   /* Do not execute rekey with disabled connections, as it would not
      go through anyway. */
   if (idata->status & SILC_IDLIST_STATUS_DISABLED)
index 876eba20217f62d0b8ded91f46b69f3e3a5bba42..df9d1b7e4ffaf87cf8ff76d0ac43a55b602e0ec6 100644 (file)
@@ -1081,15 +1081,10 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_protocol_backup)
        ctx->sessions_count++;
       }
 
-      /* If we are not standalone and our primary is not the one we're
-        talking to now, then announce our information to it since we
-        haven't done that yet.  Standalone backup router announces
-        these during connecting to the primary. */
-      if (!server->standalone && SILC_PRIMARY_ROUTE(server) != ctx->sock) {
-       silc_server_announce_servers(server, TRUE, 0, ctx->sock);
-       silc_server_announce_clients(server, 0, ctx->sock);
-       silc_server_announce_channels(server, 0, ctx->sock);
-      }
+      /* Announce data to the new primary to be. */
+      silc_server_announce_servers(server, TRUE, 0, ctx->sock);
+      silc_server_announce_clients(server, 0, ctx->sock);
+      silc_server_announce_channels(server, 0, ctx->sock);
 
       protocol->state++;
 
@@ -1313,7 +1308,9 @@ SILC_TASK_CALLBACK_GLOBAL(silc_server_protocol_backup)
        silc_server_local_servers_toggle_enabled(server, FALSE);
        router->data.status &= ~SILC_IDLIST_STATUS_DISABLED;
        silc_server_update_servers_by_server(server, backup_router, router);
-       silc_server_update_clients_by_server(server, NULL, router, FALSE);
+       silc_server_update_clients_by_server(
+                                  server, NULL, router,
+                                  server->server_type == SILC_BACKUP_ROUTER);
        if (server->server_type == SILC_SERVER)
          silc_server_update_channels_by_server(server, backup_router, router);
        silc_server_backup_replaced_del(server, backup_router);
@@ -1436,14 +1433,15 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done)
               to perfom resuming protocol. */
            server->server_type = SILC_BACKUP_ROUTER;
            silc_server_local_servers_toggle_enabled(server, FALSE);
+           server_entry->data.status &= ~SILC_IDLIST_STATUS_DISABLED;
            silc_server_update_servers_by_server(server, server->id_entry,
                                                 sock->user_data);
            silc_server_update_clients_by_server(server, NULL,
-                                                sock->user_data, FALSE);
+                                                sock->user_data, TRUE);
 
            /* Announce our clients and channels to the router */
-           silc_server_announce_clients(server, ctx->start, sock);
-           silc_server_announce_channels(server, ctx->start, sock);
+           silc_server_announce_clients(server, 0, sock);
+           silc_server_announce_channels(server, 0, sock);
          }
 
          continue;
@@ -1460,14 +1458,12 @@ SILC_TASK_CALLBACK(silc_server_protocol_backup_done)
     if (ctx->type == SILC_SERVER_BACKUP_RESUMED && server->router) {
       /* Announce all of our information to the router. */
       if (server->server_type == SILC_ROUTER)
-       silc_server_announce_servers(server, FALSE, ctx->start,
+       silc_server_announce_servers(server, FALSE, 0,
                                     server->router->connection);
 
       /* Announce our clients and channels to the router */
-      silc_server_announce_clients(server, ctx->start,
-                                  server->router->connection);
-      silc_server_announce_channels(server, ctx->start,
-                                   server->router->connection);
+      silc_server_announce_clients(server, 0, server->router->connection);
+      silc_server_announce_channels(server, 0, server->router->connection);
     }
   } else {
     /* Error */
index 8405f7e992cae9f6191dc266b463188a93c1bbfd..2cc4907804fabca8ebffc23fe4fb2b43de50d6aa 100644 (file)
@@ -386,6 +386,8 @@ silc_server_update_clients_by_real_server(SilcServer server,
   SilcIDCacheList list;
   bool tolocal = (to == server->id_entry);
 
+  SILC_LOG_DEBUG(("Start"));
+
   if (!silc_idcache_get_all(server->local_list->servers, &list))
     return NULL;
 
@@ -412,6 +414,7 @@ silc_server_update_clients_by_real_server(SilcServer server,
          }
          server_entry = server_entry->router;
        } else {
+         SILC_LOG_DEBUG(("Server locally connected"));
          /* If the client is not marked as local then move it to local list
             since the server is local. */
          if (server_entry->server_type != SILC_BACKUP_ROUTER && !local) {
@@ -420,6 +423,16 @@ silc_server_update_clients_by_real_server(SilcServer server,
                             client_cache->id, client_cache->context,
                             client_cache->expire, NULL);
            silc_idcache_del_by_context(server->global_list->clients, client);
+
+         } else if (server->server_type == SILC_BACKUP_ROUTER && local) {
+           /* If we are backup router and this client is on local list, we
+              must move it to global list, as it is not currently local to
+              us (we are not primary). */
+           SILC_LOG_DEBUG(("Moving client to global list"));
+           silc_idcache_add(server->global_list->clients, client_cache->name,
+                            client_cache->id, client_cache->context,
+                            client_cache->expire, NULL);
+           silc_idcache_del_by_context(server->local_list->clients, client);
          }
        }
 
@@ -459,6 +472,7 @@ silc_server_update_clients_by_real_server(SilcServer server,
          }
          server_entry = server_entry->router;
        } else {
+         SILC_LOG_DEBUG(("Server locally connected"));
          /* If the client is marked as local then move it to global list
             since the server is global. */
          if (server_entry->server_type != SILC_BACKUP_ROUTER && local) {
@@ -500,6 +514,16 @@ void silc_server_update_clients_by_server(SilcServer server,
   SilcClientEntry client = NULL;
   bool local;
 
+  if (from && from->id) {
+    SILC_LOG_DEBUG(("Changing from server %s",
+                   silc_id_render(from->id, SILC_ID_SERVER)));
+  }
+  if (to && to->id) {
+    SILC_LOG_DEBUG(("Changing to server %s",
+                   silc_id_render(to->id, SILC_ID_SERVER)));
+  }
+
+  SILC_LOG_DEBUG(("global list"));
   local = FALSE;
   if (silc_idcache_get_all(server->global_list->clients, &list)) {
     if (silc_idcache_list_first(list, &id_cache)) {
@@ -541,6 +565,13 @@ void silc_server_update_clients_by_server(SilcServer server,
          }
        } else {
          /* All are changed */
+         if (resolve_real_server)
+           /* Call this so that the entry is moved to correct list if
+              needed.  No resolving by real server is actually done. */
+           silc_server_update_clients_by_real_server(server, NULL, to,
+                                                     client, local,
+                                                     id_cache);
+
          client->router = to;
        }
 
@@ -555,6 +586,7 @@ void silc_server_update_clients_by_server(SilcServer server,
     silc_idcache_list_free(list);
   }
 
+  SILC_LOG_DEBUG(("local list"));
   local = TRUE;
   if (silc_idcache_get_all(server->local_list->clients, &list)) {
     if (silc_idcache_list_first(list, &id_cache)) {
@@ -592,6 +624,13 @@ void silc_server_update_clients_by_server(SilcServer server,
          }
        } else {
          /* All are changed */
+         if (resolve_real_server)
+           /* Call this so that the entry is moved to correct list if
+              needed.  No resolving by real server is actually done. */
+           silc_server_update_clients_by_real_server(server, NULL, to,
+                                                     client, local,
+                                                     id_cache);
+
          client->router = to;
        }
 
index 32ea609f16e9f12de22ece815f2d45b5af49b86b..413bdeb97284c108dec09cb9f6a5b5052b5cd88c 100644 (file)
@@ -34,7 +34,8 @@ bool silc_server_remove_clients_by_server(SilcServer server,
    attempt to figure out which clients really are originated from the
    `from' and which are originated from a server that we have connection
    to, when we've acting as backup router. If it is FALSE the `to' will
-   be the new source. */
+   be the new source.  If `from' is NULL then all clients (except locally
+   connected) are updated `to'. */
 void silc_server_update_clients_by_server(SilcServer server,
                                          SilcServerEntry from,
                                          SilcServerEntry to,