Parece que isso pode ser feito de uma forma mais simples, mas o melhor que consegui fazer depois de uma hora quebrando a cabeça foi este script Python:
#! /usr/bin/env python3
import sys, os
class Block:
    """A block of records: an ID, two sources and a list of mixtures.

    Each source is a two-character string such as "AA" or "TT".  When a
    source is passed as the empty string, it is derived from the characters
    that occur in the mixtures (see ``__init__``).
    """

    # Class-level defaults; every instance overrides them in __init__.
    block_id = ''
    source1 = ''
    source2 = ''
    mixtures = []

    def __init__(self, block_id='', source1='', source2='', mixtures=None):
        """Store the block and fill in whichever source is missing.

        Args:
            block_id: identifier printed at the start of every record.
            source1: first source, or '' if it must be derived.
            source2: second source, or '' if it must be derived.
            mixtures: iterable of mixture strings (e.g. ["AT", "TT"]).

        A source that cannot be derived (no candidate character is left in
        the mixtures) stays '' instead of raising KeyError.
        """
        self.block_id = block_id
        # Copy the input so the instance never aliases the caller's list
        # (and never shares a mutable default between instances).
        self.mixtures = list(mixtures) if mixtures is not None else []
        self.source1 = source1
        self.source2 = source2

        # Convert mixtures to a set of characters. For example,
        # ''.join(["AT", "TT"]) creates the string "ATTT"; set() then
        # converts that string to the set of characters {'A', 'T'}.
        sources = set(''.join(self.mixtures))

        # If source1 is empty, take a character that is not already used by
        # source2. min() makes the choice deterministic when more than one
        # candidate exists. The set contains single characters, so the
        # character is doubled to get "AA", "TT", etc.
        if self.source1 == '':
            candidates = sources.difference(self.source2)
            if candidates:
                self.source1 = min(candidates) * 2

        # source2 must not reuse source1's character. discard() is used
        # because a caller-supplied source1 need not occur in the mixtures.
        if self.source1:
            sources.discard(self.source1[0])

        if self.source2 == '' and sources:
            self.source2 = min(sources) * 2

    def print(self):
        """Print the block as "<id> source1|source2|mixture <value>" lines."""
        print(self.block_id, "source1", self.source1)
        print(self.block_id, "source2", self.source2)
        for mix in self.mixtures:
            print(self.block_id, "mixture", mix)
# Read from the files named on the command line, or from standard input
# when no file name is given.
if len(sys.argv) == 1:
    files = [sys.stdin]
else:
    # Generator: each file is opened only when the loop below reaches it.
    files = (open(f) for f in sys.argv[1:])

for f in files:
    try:
        # Read in all the lines as lists of whitespace-separated fields.
        # Lines with fewer than three fields (including blank lines) carry
        # no "<block> <kind> <value>" record and are skipped.
        data = [
            fields
            for fields in (rawline.split() for rawline in f)
            if len(fields) >= 3
        ]
    finally:
        # Close the files we opened ourselves; leave stdin alone.
        if f is not sys.stdin:
            f.close()

    # Get the unique block IDs. This is a set, so the order in which the
    # blocks are printed is not guaranteed to match the input order.
    blocks = set(line[0] for line in data)

    # For each block ID
    for b in blocks:
        # The corresponding mixtures
        mix = [line[2] for line in data if line[0] == b and "mixture" == line[1]]
        # If "source1 XX" is present, we will get the list ['XX'], and [] if
        # source1 is not present. ''.join() allows us to flatten ['XX'] to
        # just 'XX' (and turns [] into ''). Similarly for source2.
        source1 = ''.join(line[2] for line in data if line[0] == b and "source1" == line[1])
        source2 = ''.join(line[2] for line in data if line[0] == b and "source2" == line[1])
        # Create an object of the class defined above, and print it.
        # Initialization fills up the blank values.
        Block(b, source1, source2, mix).print()
Mesmo assim, isso produzirá os resultados em ordem arbitrária (ou seja, os dados de block3 podem vir antes dos de block1, etc.).
Salve isso em um script (digamos, insert.py) e execute:
python3 insert.py inputfile
Eu reescrevi isso em awk:
#! /usr/bin/awk -f
# build(block, source1, source2, sources, mixtures)
#
# Print every record of one block: the two sources followed by the mixtures.
#   block     - block ID printed at the start of every record
#   source1/2 - the sources; an empty one is derived from `sources`
#   sources   - array indexed by the single characters seen in the mixtures
#   mixtures  - array mapping each mixture string to its number of occurrences
#
# The extra parameters after `mixtures` (char, m, i) are awk's idiom for
# local variables; callers pass only the first five arguments. Without them
# the loop variables would be globals shared with the main rule.
function build (block, source1, source2, sources, mixtures,    char, m, i)
{
    # Compare with "" explicitly: `! source1` would also be true for a
    # value that awk evaluates as numeric zero.
    if (source1 == "")
    {
        # Take the first character whose doubled form is not source2.
        for (char in sources)
        {
            if (source2 != (char char))
            {
                source1 = char char
                # Arrays are passed by reference, so this removes the
                # character from the caller's array as well.
                delete sources[char]
                break
            }
        }
    }
    if (source2 == "")
    {
        # Same as above, this time avoiding source1.
        for (char in sources)
        {
            if (source1 != (char char))
            {
                source2 = char char
                delete sources[char]
                break
            }
        }
    }
    printf "%s %s %s\n", block, "source1", source1
    printf "%s %s %s\n", block, "source2", source2
    # Repeat each distinct mixture as many times as it was seen.
    for (m in mixtures)
    {
        for (i = 0; i < mixtures[m]; i++)
        {
            printf "%s %s %s\n", block, "mixture", m
        }
    }
}
# Main rule, run for every input record "<block> <kind> <value>".
# Records of the same block are expected to be contiguous: a change in $1
# flushes the previous block through build() and resets the state.
{
    # Skip blank lines; otherwise they would start a block with an empty ID.
    if (NF == 0)
    {
        next
    }
    if (prev != $1)
    {
        # `prev in data` is false only before the first block has been seen.
        if (prev in data)
        {
            build(prev, source1, source2, sources, mixtures)
        }
        prev = $1
        source1 = ""
        source2 = ""
        delete sources
        delete mixtures
    }
    data[$1]++
    if ($2 == "source1") {source1 = $3; next}
    if ($2 == "source2") {source2 = $3; next}
    if ($2 == "mixture")
    {
        # Count the mixture and record each of its characters as a
        # candidate source. substr() is used instead of split() with an
        # empty separator, which is not available in every awk.
        mixtures[$3]++
        for (i = 1; i <= length($3); i++)
        {
            sources[substr($3, i, 1)]++
        }
    }
}
# Flush the last block. prev is still empty when no block was read (e.g.
# empty input), in which case there is nothing to print.
END { if (prev != "") build(prev, source1, source2, sources, mixtures) }
Salve isso em um script (digamos, insert.awk), torne-o executável com chmod +x e execute:
./insert.awk inputfile
Agora a ordem dos blocos também deve ser mantida. Note que usei delete para limpar arrays inteiros, o que pode não estar disponível em alguns awks (mas deve estar no GNU awk e no mawk).