Observação: essa resposta apenas aborda alguns dos "5Ws investigativos de upload de dados" desejados.
Use o tcpdump para capturar todo o tráfego de pacotes e use pós-processamento para extrair as informações desejadas.
sudo tcpdump -i enp4s0 -w 'ext-%F-%H-%M-%S.bin' -G 3600 -z /home/doug/bin/packet_post_processor2
Onde:
minha interface voltada para WAN é enp4s0
;
Os nomes dos arquivos incluem automaticamente a data e a hora (requer um pacote adicional, mas não me lembro quais);
Estou pedindo a rotação de arquivos uma vez por hora;
Cada arquivo deve ser processado pelo script packet_post_processor
(2 é para essa resposta).
O script de pós-processamento:
#!/bin/dash
#
# packet_post_processor2 Doug Smythies. 2017.09.08
# Edits as required for updated c prgram, and bad sort order.
# There may be little use in sort by packets count, but for now
# it remians.
#
# packet_post_processor2 Doug Smythies. 2017.09.01
# This script will be called from the always running tcpdump.
# It is called for every binary file rotation.
# The purpose is to make summary files of things that one
# may want to investigate in more detail later on.
#
# This version is for WinEunuuchs2Unix and
# https://sobrelinux.info/questions/28447/why-is-internet-upload-so-high-when-i-dont-actually-upload-much"Usage - /*****************************************************************************
*
* tcpdump_bytes.c 2017.09.08 Smythies
* By sorting the input file before running this program, it can do bytes
* per IP all on its own, and in one pass through the file. At this time,
* it is for outgoing only. A future revision will add command line
* options for incoming and such.
* Might as well group by 1st 2 IP address bytes at the same time,
* i.e. for some (not all) of those multiple IP situations.
*
* tcpdump_bytes.c 2017.09.01 Smythies
* Count the bytes for all the packets in the passed file.
* See also tcpdump_extract.c, from which this was taken.
* This program is very quite, just printing bytes, unless there
* is some error. The idea is that is part of something bigger and
* therefore extra verbosity would just get in the way.
*
* Note: The input tcpdump file needs to have been done
* with the -e option.
*
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_LENGTH 2000 /* maximum line length */
void main(int argc, char **argv){
char in_buffer[MAX_LENGTH];
char *infile, *outfile1, *outfile2;
char *index, *index2;
FILE *inf, *out1, *out2;
unsigned current_bytes, sip3, sip2, sip1, sip0, sport, dip3, dip2, dip1, dip0, dport;
unsigned dest_ip, dest_ip_16, dest_ip_old, dest_ip_16_old;
unsigned num_lines, num_ips, num_16s;
unsigned long long total_bytes, total_bytes_16;
switch(argc){
case 4:
infile = argv[1];
outfile1 = argv[2];
outfile2 = argv[3];
break;
default:
printf("tcpdump_bytes infile outfile1 outfile2\n");
printf(" parse outgoing bytes per IP out of a sorted tcpdump file where the -e option was used.\n");
printf(" infile is sorted tcpdump output file; oufile1 is bytes per IP; outfile 2 is bytes per IP/16.\n");
exit(-1);
} /* endcase */
if((inf = fopen(infile, "rt")) == NULL){
printf("Unable to open input file '%s'\n", infile);
exit(-1);
} /* endif */
if((out1 = fopen(outfile1, "wt")) == NULL){
printf("Error opening output file '%s'\n", outfile1);
exit(-1);
} /* endif */
if((out2 = fopen(outfile2, "wt")) == NULL){
printf("Error opening output file '%s'\n", outfile2);
exit(-1);
} /* endif */
total_bytes = 0;
total_bytes_16 = 0;
dest_ip_old = 0;
dest_ip_16_old = 0;
num_lines = 0;
num_ips = 0;
num_16s = 0;
while((fgets(in_buffer, MAX_LENGTH, inf)) != NULL){ /* do infile line at a time */
num_lines++;
if((index = strstr(in_buffer, "), length ")) != NULL){ /* find search string if it is there, then parse the data */
sscanf(index, "), length %u: %u.%u.%u.%u.%u > %u.%u.%u.%u.%u:",
¤t_bytes,
&sip3, &sip2, &sip1, &sip0,
&sport,
&dip3, &dip2, &dip1, &dip0,
&dport);
} else {
printf("tcpdump_bytes: Got an odd line: %s", in_buffer);
} /* endif */
dest_ip_16 = (dip3 << 24) + (dip2 << 16);
dest_ip = dest_ip_16 + (dip1 << 8) + dip0;
// printf("debug: B: %u S: %u.%u.%u.%u.%u D: %u.%u.%u.%u.%u %u %u\n", current_bytes, sip3, sip2, sip1, sip0, sport, dip3, dip2, dip1, dip0, dport, dest_ip, dest_ip_16);
if(dest_ip != dest_ip_old){
if(total_bytes != 0){
fprintf(out1, "%llu %u.%u.%u.%u\n", total_bytes, (dest_ip_old >> 24) & 0xff, (dest_ip_old >> 16) & 0xff, (dest_ip_old >> 8) & 0xff, dest_ip_old & 0xff);
total_bytes = 0;
} /* endif */
dest_ip_old = dest_ip;
num_ips++;
} /* endif */
total_bytes = total_bytes + (unsigned long long) current_bytes;
if(dest_ip_16 != dest_ip_16_old){
if(total_bytes_16 != 0){
fprintf(out2, "%llu %u.%u.0.0/16\n", total_bytes_16, (dest_ip_16_old >> 24) & 0xff, (dest_ip_16_old >> 16) & 0xff);
total_bytes_16 = 0;
} /* endif */
dest_ip_16_old = dest_ip_16;
num_16s++;
} /* endif */
total_bytes_16 = total_bytes_16 + (unsigned long long) current_bytes;
} /* endwhile */
/* don't forget to output the last data */
if(total_bytes != 0){
fprintf(out1, "%llu %u.%u.%u.%u\n", total_bytes, dip3, dip2, dip1, dip0);
} else {
printf("tcpdump_bytes: Something is wrong. Last IP address has no bytes.\n");
} /* endif */
if(total_bytes_16 != 0){
fprintf(out2, "%llu %u.%u.0.0/16\n", total_bytes_16, dip3, dip2);
} else {
printf("tcpdump_bytes: Something is wrong. Last IP/16 address has no bytes.\n");
} /* endif */
fclose(inf);
fclose(out1);
fclose(out2);
printf("tcpdump_bytes: Done. Processed %d lines and %d IP addresses and %d /16 addresses\n", num_lines, num_ips, num_16s);
} /* endprogram */
file-name"
exit 1
fi
# check that the file actually exists:
if [ ! -f ]
then
echo "tcpdump binary file does not exist, aborting..."
exit 1
fi
echo "data extraction 1: All the packets..."
# Note: Using the -e option will ease subsequent bytes per unit time calculations
sudo tcpdump -n -tttt -e -r >all_e.txt
echo "data extraction 2: The outgoing normal packets..."
# Note: We might want to check that something important doesn't get missed here.
# Note: replace the fake IP address with your actual IP address.
grep ": XXX\.XXX\.XXX\.XXX\." all_e.txt | grep Flags >outgoing.txt
echo "data extraction 3: Make a histogram of the destination IP addresses by packets..."
# Note: use field 13
cut -d" " -f13 outgoing.txt | sed 's/.[^.]*$//' | sort | uniq -c | sort -g >outhisto.txt
# Phase 2: Maximum packet count might not mean maximum byte count, so figure out maximum byte count
echo "data extraction 4: Sort the outgoing file by destination IP address."
LC_ALL=C sort -k 13 <outgoing.txt >outgoing.srt
echo "data extraction 5: Now, calculate bytes per IP and bytes per IP/16 and make sorted historgrams"
# Note: There might be some clever awk or whatever way to do this, but I have a c program.
./tcpdump_bytes outgoing.srt outb.txt out16.txt
sort --general-numeric-sort <outb.txt >outhistob.txt
sort --general-numeric-sort <out16.txt >outhistob16.txt
#Leave the intermidiate files, just for now, while we debug.
#
# packet_post_process. End.
O programa c chamado de dentro do script:
2017-05-31 08:10:31.721956 00:22:b0:75:c2:bd > 6c:be:e9:a7:f1:07, ethertype IPv4 (0x0800), length 400: XXX.XXX.XXX.XXX.52779 > 38.113.165.77.443: Flags [P.], seq 1:347, ack 1, win 256, length 346
2017-05-31 08:10:31.826241 6c:be:e9:a7:f1:07 > 00:22:b0:75:c2:bd, ethertype IPv4 (0x0800), length 157: 38.113.165.77.443 > XXX.XXX.XXX.XXX.52779: Flags [P.], seq 1:104, ack 347, win 1026, length 103
2017-05-31 08:10:31.877945 00:22:b0:75:c2:bd > 6c:be:e9:a7:f1:07, ethertype IPv4 (0x0800), length 54: XXX.XXX.XXX.XXX.52779 > 38.113.165.77.443: Flags [.], ack 104, win 256, length 0
2017-05-31 08:10:32.603768 00:22:b0:75:c2:bd > 6c:be:e9:a7:f1:07, ethertype ARP (0x0806), length 42: Request who-has XXX.XXX.XXX.YYY tell XXX.XXX.XXX.XXX, length 28
2017-05-31 08:10:32.630960 6c:be:e9:a7:f1:07 > 00:22:b0:75:c2:bd, ethertype ARP (0x0806), length 60: Reply XXX.XXX.XXX.YYY is-at 6c:be:e9:a7:f1:07, length 46
2017-05-31 08:10:33.643468 00:90:d0:63:ff:00 > 01:00:5e:00:00:01, ethertype IPv4 (0x0800), length 60: 10.197.248.13 > 224.0.0.1: igmp query v2
2017-05-31 08:10:37.448732 00:22:b0:75:c2:bd > 6c:be:e9:a7:f1:07, ethertype IPv4 (0x0800), length 90: XXX.XXX.XXX.XXX.53120 > 91.189.89.199.123: NTPv4, Client, length 48
Observe que alguns arquivos serão reprovados no processamento das próximas horas. Eu vou consertar isso depois.
Um breve resumo do que o script de pós-processamento está fazendo:
Primeiro, o arquivo binário tcpdump é convertido em texto por sumários de pacotes. Exemplo (meu endereço foi alterado para XXX.XXX.XXX.XXX):
packets IP Address Added Comment
299517 91.189.95.84 Ubuntu stuff
301129 198.38.112.140 Netflix
306815 17.253.31.206 Apple stuff
319558 129.97.134.71 Ubuntu stuff (mirror, I think)
333334 91.189.88.152 Ubuntu stuff
352141 91.189.88.39 Ubuntu stuff
353160 209.121.139.153 Telus (Microsoft updates streaming)
368669 209.121.139.163 Telus (Microsoft updates streaming)
389928 91.189.88.161 Ubuntu stuff
396087 23.60.74.158 deploy.static.akamaitechnologies.com (?)
421259 198.38.112.170 Netflix
474506 17.253.31.205 Apple stuff
477706 198.38.109.153 Netflix
480452 198.38.112.159 Netflix
540261 198.38.112.173 Netflix
574592 198.38.112.132 Netflix
710022 198.38.112.174 Netflix
728434 209.121.139.144 Telus (Microsoft updates streaming)
738839 198.38.112.130 Netflix
883688 198.38.109.171 Netflix
1049778 198.38.112.154 Netflix
2166582 72.21.81.200 Hmmmm ? MCI Communications Services, (Skype, I think)
7512548 13.107.4.50 Microsoft (updates)
É de propósito que um par de pacotes ARP seja incluído no exemplo, portanto, mostre algo que seria excluído do processamento posterior.
O pacote IGMP irritante de um IP de LAN privada é do meu provedor e também será excluído do processamento posterior. No entanto, se o meu ISP afirmar que eu ultrapassei meu limite de dados mensais, indicarei esses pacotes quando disser que não vou pagar.
Observe dois comprimentos mostrados em cada linha, o primeiro é bytes no fio e o segundo é o comprimento da carga útil. Queremos bytes no fio, e é por isso que usamos a opção -e com o tcpdump.
Em segundo lugar, o pacote de saída pode ser identificado exclusivamente encontrando ": XXX.XXX.XXX.XXX.", portanto, extraia todos os pacotes de saída, não incluindo ARP e ICMP, usando grep.
Em terceiro lugar, usando o espaço como um delimitador, o campo 13 é o endereço IP de destino, portanto, use um grupo complicado de comandos canalizados para extrair, contar e classificar os pacotes de endereço IP de destino.
Para frente, classifique os pacotes de saída pelo endereço IP de destino.
Quinto, use o programa c para calcular bytes por IP e bytes por IP / 16 e classificar a saída em histogramas.
Em sexto lugar, investigue manualmente os principais endereços IP na tentativa de identificar o que está acontecendo. Observe que, com muita frequência, é possível encontrar a consulta DNS de pesquisa direta relacionada na saída tcpdump.
Como exemplo, examinei meus dados de WAN / LAN entre 2017-05-31 08:09:33 e 2017-08-09 22:13:11 e editei o que encontrei para os vários endereços IP. < br>
Primeiro os poucos mais por contagem de pacotes:
Bytes IP Added Comment
32358580 17.253.31.205 Apple stuff
32625068 198.38.112.159 Netflix
34220805 172.217.3.206 Google web crawler
36628021 198.38.112.173 Netflix
37022702 17.188.208.132 Apple stuff
39105254 198.38.112.132 Netflix
40697177 209.121.139.144 Telus Microsoft updates file streaming
48247623 198.38.112.174 Netflix
49537980 64.4.54.254 Microsoft
50358753 198.38.112.130 Netflix
59623846 198.38.109.171 Netflix
71532166 198.38.112.154 Netflix
98480036 207.167.198.18 Telus e-mail stuff
139907010 72.21.81.200 Hmmmm ? MCI Communications Services, (Skype, I think)
210138801 91.189.95.84 Ubuntu stuff
325511064 204.79.197.213 Microsoft (?) msedge.net storage.skyprod.akadns.net
479586878 13.107.4.50 Microsoft (updates)
Em segundo lugar, os poucos mais por contagem de bytes:
107592753 209.52.0.0/16 cache.google.com (for example)
116538884 207.167.0.0/16 Telus e-mail stuff
120769715 17.188.0.0/16 Apple. store-025-failover2.blobstore-apple.com.akadns.net (for example)
139261655 52.218.0.0/16 s3-us-west-2.amazonaws.com (for example) ? Hmmm...
147091123 172.217.0.0/16 Google web crawler
153146532 17.248.0.0/16 p46-keyvalueservice.fe.apple-dns.net. Apple iCloud Drive
183300509 72.21.0.0/16 Skype (I think)
213119564 209.121.0.0/16 Telus Microsoft updates file streaming
333374588 204.79.0.0/16 Microsoft
354346088 91.189.0.0/16 Ubuntu stuff
488793579 13.107.0.0/16 Microsoft (updates)
621733032 198.38.0.0/16 Netflix
Observe como, como o Netflix, por exemplo, usa muitos endereços IP, ele pode cair mais baixo no ranking do que deveria, se todos os seus endereços IP fossem tratados como um.
Em terceiro lugar, os primeiros grupos / 16 grupos por contagem de bytes. Observe como o Netflix é agora o maior:
%pre%