Então eu dei uma olhada nisso com o Python e acho que criei um script simples que faz o que você quer. Aqui está:
#!/usr/bin/env python2
# -*- coding: ascii -*-
"""pathlist2xml.py
Takes a list of file-system paths and
generates an XML representation of the
corresponding file-system hierarchy.
"""
import sys
from lxml.etree import Element, SubElement, fromstring, tostring, XMLParser
from xml.sax.saxutils import escape, unescape
from os.path import join, isdir
from posix import lstat
import fileinput
def insert_path(xmlroot, path):
    """Insert the nodes that represent 'path' beneath the element 'xmlroot'.

    The path is split on '/' and walked one component at a time. A
    component that already has a child node (matched on its 'name'
    attribute) is reused; otherwise a new <directory> or <file> element
    is appended, carrying 'name', 'path' and 'inode' attributes.

    Parameters:
        xmlroot: lxml element under which the hierarchy is built
            (it is modified in place).
        path: '/'-separated path. A relative path is checked by isdir()
            and lstat() relative to the current working directory.

    Returns:
        The same 'xmlroot' element (for convenience - not necessary).

    Raises:
        OSError: from lstat() when a newly inserted component cannot
            be stat'ed (for example because it does not exist).
    """
    # Initialize a node cursor to start at the root node
    xmlcursor = xmlroot
    # Keep track of the path walked so far; keep a leading '/' so that an
    # absolute path is not silently looked up relative to the cwd
    fullpath = '/' if path.startswith('/') else ''
    # Iterate through the components of the path
    for path_component in path.split('/'):
        # Skip empty components (empty path, leading/trailing or doubled
        # slashes): they name nothing and lstat('') would raise OSError
        if not path_component:
            continue
        # Update the path
        fullpath = join(fullpath, path_component)
        # Turn non-printable / non-ASCII bytes into backslash escapes so the
        # values are plain ASCII. No manual XML escaping is done here: lxml
        # escapes attribute values itself when serializing, so calling
        # escape() first would double-encode '&', '<' and '>'.
        fullpath_encoded = fullpath.encode('string-escape')
        path_component_encoded = path_component.encode('string-escape')
        # Check whether the component is already represented by a node. The
        # name is passed as an XPath variable so that quotes in a file name
        # cannot break (or alter) the expression.
        xmlnodes = xmlcursor.xpath("./*[@name=$name]",
                                   name=path_component_encoded)
        # If the node exists, just move the cursor onto it
        if xmlnodes:
            xmlcursor = xmlnodes[0]
            continue
        # Otherwise create it, choosing the tag from what is on disk
        tag = "directory" if isdir(fullpath) else "file"
        xmlcursor = SubElement(xmlcursor, tag)
        # Add some file-attributes
        xmlcursor.set('name', path_component_encoded)
        xmlcursor.set('path', fullpath_encoded)
        xmlcursor.set('inode', str(lstat(fullpath).st_ino))
    # Return the modified root element (for convenience - not necessary)
    return xmlroot
def paths_to_xml(pathlist):
    """Build an XML tree mirroring the hierarchy of the given paths.

    Parameters:
        pathlist: iterable of path strings, e.g. the lines of a file.
            Surrounding whitespace and slashes are stripped from each
            entry; entries that are empty afterwards are ignored.

    Returns:
        A new 'root' element populated by calling insert_path() once
        for every non-empty entry.
    """
    xmlroot = Element('root')
    for path in pathlist:
        # Drop the line terminator / whitespace, then surrounding slashes
        cleaned = path.strip().strip('/')
        # A blank line or a bare '/' leaves no path to insert
        if not cleaned:
            continue
        insert_path(xmlroot, cleaned)
    return xmlroot
# Entry point: the path list is read from standard input or from the
# files named on the command line
if __name__ == "__main__":
    # Build the tree from the input lines, serialize it with indentation
    # and print the resulting XML document
    print(tostring(paths_to_xml(fileinput.input()), pretty_print=True))
E aqui está um pequeno exemplo ilustrando como isso pode funcionar na prática. Primeiro eu criei alguns diretórios e arquivos:
mkdir -p /tmp/xmltest
cd /tmp/xmltest
touch file1
touch file2
mkdir dir1
touch dir1/file3
touch dir1/file4
mkdir dir2
mkdir dir2/dir3
touch dir2/dir3/file5
Veja como fica esta sub-hierarquia quando exibida com o comando `tree`:
.
├── dir1
│ ├── file3
│ └── file4
├── dir2
│ └── dir3
│ └── file5
├── file1
└── file2
E aqui está um exemplo de como você pode chamar o script passando apenas a saída do `find`:
find . | pathlist2xml.py
E aqui está a saída XML que foi gerada:
<root>
<directory name="." path="." inode="3587802">
<directory name="dir1" path="./dir1" inode="3587817">
<file name="file3" path="./dir1/file3" inode="3587818"/>
<file name="file4" path="./dir1/file4" inode="3587819"/>
</directory>
<directory name="dir2" path="./dir2" inode="3587820">
<directory name="dir3" path="./dir2/dir3" inode="3587821">
<file name="file5" path="./dir2/dir3/file5" inode="3587822"/>
</directory>
</directory>
<file name="file1" path="./file1" inode="3587815"/>
<file name="file2" path="./file2" inode="3587816"/>
</directory>
</root>
E aqui está um segundo exemplo, que combina `find` com `grep`:
find . | grep dir2 | pathlist2xml.py
E aqui está a saída desse segundo exemplo:
<root>
<directory name="." path="." inode="3587802">
<directory name="dir2" path="./dir2" inode="3587820">
<directory name="dir3" path="./dir2/dir3" inode="3587821">
<file name="file5" path="./dir2/dir3/file5" inode="3587822"/>
</directory>
</directory>
</directory>
</root>