Testei round-robin de DNS com SSH e notei resultados surpreendentes do cliente SSH em meu ambiente de teste. Eu estou usando 3 nós com o RHEL 6.2 (openssh-5.3p1, bind-9.7.3-8.P3). Coisas como as chaves do host foram gerenciadas.
Meu "problema":
Eu gostaria de um tipo rudimentar de balanceamento de carga entre vários servidores SSH, usando várias entradas de DNS. Eu estava (quase) certo de que isso era possível. Mas eu tenho um tipo rudimentar de HA ... Parece que o cliente openssh não se importa com o round-robin, ele sempre se conecta ao mesmo nó, exceto se for down, neste último caso o cliente usa outro registro do a lista de entradas de DNS e, em seguida, conecte-se a ela com êxito. É o comportamento normal / comum? Ou o que está errado nos meus testes?
Eu coloquei meus straces e tcpdumps do que acontece em vários casos. Agradecemos antecipadamente se você tem alguma idéia ou explicação que possa ajudar:)
login = > 10.255.254.1 (nó0), 10.255.254.3 (nó2)
cliente ssh = > 10.255.254.2 (node1)
Servidor DNS no node0, o RR não foi desativado.
login IN A 10.255.254.1
login IN A 10.255.254.3
Confirmo que:
- a pesquisa com host (1) confirma o Round-Robin;
- o comando ping (1) parece bom:
[root @ node1 ~] # login de ping
PING login.node (10.255.254.3) 56(84) bytes of data.
64 bytes from node2.node (10.255.254.3): icmp_seq=1 ttl=64 time=1.73 ms
^C
[root@node1 ~]# ping login
PING login.node (10.255.254.1) 56(84) bytes of data.
64 bytes from node0.node (10.255.254.1): icmp_seq=1 ttl=64 time=0.467 ms
^C
[root@node1 ~]# ping login
PING login.node (10.255.254.3) 56(84) bytes of data.
64 bytes from node2.node (10.255.254.3): icmp_seq=1 ttl=64 time=0.433 ms
^C
TESTE 1 (os dois servidores SSH estão ativos e acessíveis)
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node2 ~]# /etc/init.d/sshd stop
Stopping sshd: [ OK ]
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node2 ~]# /etc/init.d/sshd restart
Stopping sshd: [FAILED]
Starting sshd: [ OK ]
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="login IN A 10.255.254.1
login IN A 10.255.254.3
PING login.node (10.255.254.3) 56(84) bytes of data.
64 bytes from node2.node (10.255.254.3): icmp_seq=1 ttl=64 time=1.73 ms
^C
[root@node1 ~]# ping login
PING login.node (10.255.254.1) 56(84) bytes of data.
64 bytes from node0.node (10.255.254.1): icmp_seq=1 ttl=64 time=0.467 ms
^C
[root@node1 ~]# ping login
PING login.node (10.255.254.3) 56(84) bytes of data.
64 bytes from node2.node (10.255.254.3): icmp_seq=1 ttl=64 time=0.433 ms
^C
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node2 ~]# /etc/init.d/sshd stop
Stopping sshd: [ OK ]
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="[root@node2 ~]# /etc/init.d/sshd restart
Stopping sshd: [FAILED]
Starting sshd: [ OK ]
[root@node1 ~]# strace -e connect ssh login
connect(3, {sa_family=AF_FILE, path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
(...)
connect(3, {sa_family=AF_INET, sin_port=htons(53), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_UNSPEC, sa_data="%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
(...)
17:17:12.893633 IP node1.node.42432 > node0.node.domain: 7264+ A? login.node. (29)
17:17:12.893988 IP node0.node.domain > node1.node.42432: 7264* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = -1 ECONNREFUSED (Connection refused)
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
(...)
17:11:44.154595 IP node1.node.56947 > node0.node.domain: 4602+ A? login.node. (29)
17:11:44.154862 IP node0.node.domain > node1.node.56947: 4602* 2/1/1 A 10.255.254.3, A 10.255.254.1 (102)
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = -1 ECONNREFUSED (Connection refused)
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
17:09:05.854022 IP node1.node.41233 > node0.node.domain: 63435+ A? login.node. (29)
17:09:05.854055 IP node1.node.41233 > node0.node.domain: 3015+ AAAA? login.node. (29)
17:09:05.854436 IP node0.node.domain > node1.node.41233: 63435* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
17:09:05.854531 IP node0.node.domain > node1.node.41233: 3015* 0/1/0 (79)
17:09:05.856764 IP node1.node.59579 > node0.node.ssh: Flags [S], seq 3025023931, win 14600, options [mss 1460,sackOK,TS val 9854496 ecr 0,nop,wscale 7], length 0
17:09:05.856806 IP node0.node.ssh > node1.node.59579: Flags [S.], seq 1105519762, ack 3025023932, win 14480, options [mss 1460,sackOK,TS val 350907197 ecr 9854496,nop,wscale 7], length 0
17:09:05.857106 IP node1.node.59579 > node0.node.ssh: Flags [.], ack 1, win 115, options [nop,nop,TS val 9854496 ecr 350907197], length 0
17:09:05.865291 IP node0.node.ssh > node1.node.59579: Flags [P.], seq 1:22, ack 1, win 114, options [nop,nop,TS val 350907205 ecr 9854496], length 21
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
(...)
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
17:04:29.663664 IP node1.node.51950 > node0.node.domain: 4685+ A? login.node. (29)
17:04:29.663685 IP node1.node.51950 > node0.node.domain: 36559+ AAAA? login.node. (29)
17:04:29.664046 IP node0.node.domain > node1.node.51950: 4685* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
17:04:29.664110 IP node0.node.domain > node1.node.51950: 36559* 0/1/0 (79)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
(...)
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
17:03:04.875099 IP node1.node.53511 > node0.node.domain: 55904+ A? login.node. (29)
17:03:04.875417 IP node0.node.domain > node1.node.53511: 55904* 2/1/1 A 10.255.254.3, A 10.255.254.1 (102)
17:03:04.875432 IP node1.node.53511 > node0.node.domain: 22271+ AAAA? login.node. (29)
17:03:04.875523 IP node0.node.domain > node1.node.53511: 22271* 0/1/0 (79)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
(...)
17:17:12.893633 IP node1.node.42432 > node0.node.domain: 7264+ A? login.node. (29)
17:17:12.893988 IP node0.node.domain > node1.node.42432: 7264* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = -1 ECONNREFUSED (Connection refused)
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
(...)
17:11:44.154595 IP node1.node.56947 > node0.node.domain: 4602+ A? login.node. (29)
17:11:44.154862 IP node0.node.domain > node1.node.56947: 4602* 2/1/1 A 10.255.254.3, A 10.255.254.1 (102)
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = -1 ECONNREFUSED (Connection refused)
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
17:09:05.854022 IP node1.node.41233 > node0.node.domain: 63435+ A? login.node. (29)
17:09:05.854055 IP node1.node.41233 > node0.node.domain: 3015+ AAAA? login.node. (29)
17:09:05.854436 IP node0.node.domain > node1.node.41233: 63435* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
17:09:05.854531 IP node0.node.domain > node1.node.41233: 3015* 0/1/0 (79)
17:09:05.856764 IP node1.node.59579 > node0.node.ssh: Flags [S], seq 3025023931, win 14600, options [mss 1460,sackOK,TS val 9854496 ecr 0,nop,wscale 7], length 0
17:09:05.856806 IP node0.node.ssh > node1.node.59579: Flags [S.], seq 1105519762, ack 3025023932, win 14480, options [mss 1460,sackOK,TS val 350907197 ecr 9854496,nop,wscale 7], length 0
17:09:05.857106 IP node1.node.59579 > node0.node.ssh: Flags [.], ack 1, win 115, options [nop,nop,TS val 9854496 ecr 350907197], length 0
17:09:05.865291 IP node0.node.ssh > node1.node.59579: Flags [P.], seq 1:22, ack 1, win 114, options [nop,nop,TS val 350907205 ecr 9854496], length 21
(...)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
(...)
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
17:04:29.663664 IP node1.node.51950 > node0.node.domain: 4685+ A? login.node. (29)
17:04:29.663685 IP node1.node.51950 > node0.node.domain: 36559+ AAAA? login.node. (29)
17:04:29.664046 IP node0.node.domain > node1.node.51950: 4685* 2/1/1 A 10.255.254.1, A 10.255.254.3 (102)
17:04:29.664110 IP node0.node.domain > node1.node.51950: 36559* 0/1/0 (79)
%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%%pre%"}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.1")}, 16) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(22), sin_addr=inet_addr("10.255.254.3")}, 16) = 0
(...)
[root@node0 ~]# tcpdump -i eth0 src node1 or dst node1
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
17:03:04.875099 IP node1.node.53511 > node0.node.domain: 55904+ A? login.node. (29)
17:03:04.875417 IP node0.node.domain > node1.node.53511: 55904* 2/1/1 A 10.255.254.3, A 10.255.254.1 (102)
17:03:04.875432 IP node1.node.53511 > node0.node.domain: 22271+ AAAA? login.node. (29)
17:03:04.875523 IP node0.node.domain > node1.node.53511: 22271* 0/1/0 (79)
= > conexão no node2 (10.255.254.3)
TEST 2 (os dois servidores SSH ainda estão ativos e acessíveis)
%pre%
= > conexão no node2
(outro teste confirma a conexão com o node2 novamente. Parece que o round-robin é usado apenas para testes preliminares pelo cliente ssh)
TESTE 3 (o servidor SSH no node2 está parado)
%pre%
= > conexão em node0 (failover ?? surpresa!)
TESTE 4 (mesmas condições)
%pre%
= > mesmo resultado (conexão no node0)
TESTE 5 (o servidor SSH no node2 é reiniciado)
%pre%
= > conexão em node2 novamente (failback)