Correspondência de padrões e mesclagem de arquivos com awk

0

Eu tenho 3 arquivos: 1.csv, 2.csv e 3.csv.

1.csv

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1  
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791  
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2  
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85  

2.csv

SIMINN_ICELAND_TELECOM,ICELAND,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795  
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638

3.csv:

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Calling_Party_Address_Blocking,79  
CABLE&WIRELESS_BARBADOS,BARBADOS,Calling_Party_Address_Blocking,30  
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12

Eu quero mesclar os arquivos de modo que a saída seja impressa como abaixo:

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA 
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
    
por UJJAWAL KHARE 05.08.2014 / 09:46

3 respostas

1

Se você não se importa com a ordem de saída, aqui está uma solução em perl:

$ perl -F',' -anle '
    # Each record is split on "," into @F (-a -F). Fields 1 and 2 form the
    # merge key; the remaining fields are stored per key and per input file.
    my $key = join ",", @F[0, 1];
    push @{ $rows{$key}{$ARGV} }, @F[2 .. $#F];

    # On the last line of every input file, remember that file name.
    push @inputs, $ARGV if eof;

    END {
        # File names are compared numerically (1.csv, 2.csv, 3.csv compare
        # by their leading number) to fix the column order of the output.
        my @ordered = sort { $a <=> $b } @inputs;

        # Hash traversal order is arbitrary, so output lines come out in
        # no particular order.
        foreach my $key (keys %rows) {
            # For every file: the stored fields, or the NA,NA placeholder
            # when the key never appeared in that file.
            my @cols = map {
                defined($rows{$key}{$_}) ? @{ $rows{$key}{$_} } : qw(NA NA)
            } @ordered;
            print join ",", ($key, @cols);
        }
    }
' 1.csv 2.csv 3.csv
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
    
por 05.08.2014 / 11:37
1

Em awk :

#!/usr/bin/awk -f

# Merge comma-separated records that share the same first field.
# The first record seen for a key is kept whole; every later record with
# that key contributes only its fields 3..NF, appended with commas.

BEGIN { FS = "," }

{
    key = $1

    # First occurrence of this key: store the complete line and move on.
    if (!(key in merged)) {
        merged[key] = $0
        next
    }

    # Repeated key: append everything after the first two fields.
    for (col = 3; col <= NF; col++)
        merged[key] = merged[key] "," $col
}

END {
    # for-in traversal order is unspecified in awk, so the order of the
    # output lines is not guaranteed.
    for (key in merged)
        print merged[key]
}

Salve como merge.awk e torne-o executável com chmod +x merge.awk.

Deve produzir:

CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638
SYNIVERSE_ANSI,UNITED_STATES,Calling_Party_Address_Blocking,12
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,Calling_Party_Address_Blocking,6
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795
TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,Calling_Party_Address_Blocking,30

Quando executado como merge.awk 1.csv 2.csv 3.csv ou mesmo merge.awk *.csv.

    
por 05.08.2014 / 12:05
0
awk -F ',' '
  # Merge three CSV files on the "name,country" key (fields 1 and 2).
  #
  #   keys[1..n]   every distinct key, in the order it was first seen
  #   ar1/ar2/ar3  the "type,count" pair (fields 3 and 4) found for a key
  #                in the 1st / 2nd / 3rd file named on the command line
  #
  # Output: one line per key, formatted as
  #   key,<pair from file 1>,<pair from file 2>,<pair from file 3>
  # where a file that does not contain the key contributes "NA,NA".

  # Record a key in the output order list exactly once, whichever file it
  # first shows up in. Checking a single "seen" registry (instead of only
  # ar1) keeps a key that is missing from file 1 but present in both
  # file 2 and file 3 from being listed, and therefore printed, twice.
  function remember(k) {
    if (!(k in seen)) {
      seen[k] = 1
      keys[++n] = k
    }
  }

  FILENAME == ARGV[1] {
    key = $1 FS $2
    remember(key)
    ar1[key] = $3 FS $4
    next
  }
  FILENAME == ARGV[2] {
    key = $1 FS $2
    remember(key)
    ar2[key] = $3 FS $4
    next
  }
  FILENAME == ARGV[3] {
    key = $1 FS $2
    remember(key)
    ar3[key] = $3 FS $4
    next
  }

  END {
    for (i = 1; i <= n; i++) {
      key = keys[i]
      # Fill in the placeholder for every file that lacks this key.
      if (!(key in ar1)) ar1[key] = "NA,NA"
      if (!(key in ar2)) ar2[key] = "NA,NA"
      if (!(key in ar3)) ar3[key] = "NA,NA"
      printf "%s,%s,%s,%s\n", key, ar1[key], ar2[key], ar3[key]
    }
  }' 1.csv 2.csv 3.csv

A saída do comando acima é

TELECOM_DEVELOPMENT_AFGHANISTAN_COMPANY,AFGHANISTAN,Alphanumeric_A_MSISDN_blocking,1,NA,NA,Calling_Party_Address_Blocking,79
CABLE&WIRELESS_BARBADOS,BARBADOS,Alphanumeric_A_MSISDN_blocking,791,NA,NA,Calling_Party_Address_Blocking,30
SIMINN_ICELAND_TELECOM,ICELAND,Alphanumeric_A_MSISDN_blocking,109373,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),7795,NA,NA
CABLE&WIRELESS_SEYCHELLES,SEYCHELLES,Alphanumeric_A_MSISDN_blocking,2,SPAM_CHAIN_SMS_REJECT(Spam_Detection_and_Blocking),638,NA,NA
CABLE&WIRELESS_JAMAICA,JAMAICA,Alphanumeric_A_MSISDN_blocking,85,NA,NA,NA,NA
MOBILKOM_LIECHTENSTEIN,LIECHTENSTEIN,NA,NA,NA,NA,Calling_Party_Address_Blocking,6
SYNIVERSE_ANSI,UNITED_STATES,NA,NA,NA,NA,Calling_Party_Address_Blocking,12
    
por 06.08.2014 / 08:18

Tags