<?php
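/**
* CLI script that takes the path of a tesseract hOCR (HTML) output file as its
* first argument and writes the text content of every <p> element, joined by
* newlines, to the file given as its second argument.
* Usage: php script.php in.tif.html out.txt
*/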
// Validate the command line first so a missing argument yields a usage
// message instead of an "undefined offset" notice followed by a bogus read.
if (!isset($argv[1], $argv[2])) {
    $scriptName = isset($argv[0]) ? basename($argv[0]) : 'script.php';
    fwrite(STDERR, "Usage: php {$scriptName} in.tif.html out.txt\n");
    exit(1);
}

$inFile = $argv[1];
$outFile = $argv[2];

// file_get_contents() returns false (not an exception) on failure.
$stream = file_get_contents($inFile);
if ($stream === false) {
    fwrite(STDERR, "Cannot read input file: {$inFile}\n");
    exit(1);
}

$out = [];

// loadHTML() refuses an empty document (warning on older PHP, ValueError on
// PHP 8), so only parse when there is something to parse. An empty input
// simply produces an empty output file.
if (trim($stream) !== '') {
    // loadHTML() is an instance method: calling it statically was deprecated
    // and is a fatal Error since PHP 8, so instantiate the document first.
    $dom = new DOMDocument();

    // Collect libxml parse diagnostics internally instead of printing them as
    // PHP warnings; the previous setting is restored right after parsing.
    $previousSetting = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($stream);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if ($loaded === false) {
        fwrite(STDERR, "Cannot parse HTML from: {$inFile}\n");
        exit(1);
    }

    // Gather the text content of every <p> element in document order.
    foreach ($dom->getElementsByTagName('p') as $tag) {
        $out[] = $tag->nodeValue;
    }
}

// One paragraph per line; report a failed write through the exit status.
if (file_put_contents($outFile, implode("\n", $out)) === false) {
    fwrite(STDERR, "Cannot write output file: {$outFile}\n");
    exit(1);
}