#!/usr/bin/ruby
#Matt Smith/Shawn Rainey
#antiword-xp.rb - Convert docx files to plaintext
# add an each_wrapped_line method to the String class
class String
  # Word-wraps the receiver and returns the wrapped lines as an Array.
  #
  # Every line of the receiver (as split by each_line) is treated as one
  # paragraph and wrapped independently. Words are separated by a single
  # space; a line may carry a trailing space when that space still fits
  # within +cols+.
  #
  # cols::           maximum line width. Defaults to $wrapWidth. 0 (or nil,
  #                  which is coerced to 0) disables wrapping altogether.
  # p_seperator::    text appended to the last line of every paragraph that
  #                  had to be wrapped onto more than one line.
  # seperateSingle:: when true, the separator is also appended to paragraphs
  #                  that fit on a single line.
  #
  # If a block is given, each resulting line is yielded to it. The Array of
  # lines is returned in either case.
  def each_wrapped_line(cols = $wrapWidth, p_seperator = "\n", seperateSingle = false)
    # An unset $wrapWidth arrives here as nil; treat it as "do not wrap".
    cols = cols.to_i
    lines = []
    each_line do |paragraph|
      wrapped = []
      current = ""
      paragraph.split.each do |word|
        if cols == 0 || current.length + word.length <= cols
          current += word
          # Add the separating space only if it still fits on the line
          current += " " unless current.length == cols
        else
          # The line is full: flush it (never flush an empty line)
          wrapped << current unless current.empty?
          if word.length + 1 > cols
            # A word wider than the line gets a line of its own. Start the
            # next line from scratch so the flushed line is not reused.
            wrapped << word
            current = ""
          else
            current = word + " "
          end
        end
      end
      # Keep blank paragraphs as empty lines, but do not add an empty line
      # after a paragraph that ended with an over-long word.
      wrapped << current unless current.empty? && !wrapped.empty?
      # Always use the separator when a paragraph spans more than one line
      wrapped[-1] += p_seperator if seperateSingle || wrapped.length > 1
      lines.concat(wrapped)
    end
    # Yield lines if a block was given; the lines are returned either way.
    lines.each { |line| yield(line) } if block_given?
    lines
  end
end
#The environment variable $COLUMNS is not exported to child processes,
# so ask tput for the terminal width instead.
#$consoleWidth is the width used for help text; $wrapWidth is the width
# used for the document and may later be overridden by the -w argument.
begin
  IO.popen("tput cols") { |process| $consoleWidth = process.read.to_i }
rescue Errno::ENOENT
  #tput is not installed
  $consoleWidth = 0
end
#tput can also run but print nothing usable (to_i then yields 0).
# A width of 0 would disable wrapping, so fall back to 80 in that case too.
$consoleWidth = 80 unless $consoleWidth.to_i > 0
$wrapWidth = $consoleWidth
# Prints the help text (summary, arguments, examples) to standard output.
# The summary and the argument descriptions are wrapped to $consoleWidth
# with String#each_wrapped_line; the examples are printed verbatim.
def usage
  summary = "Usage: #{$0} takes a .doc or .docx formatted word document. " \
            "It can be called either by piping the document to antiword, " \
            "or by calling `#{$0} filename`\n"
  summary.each_wrapped_line($consoleWidth) { |line| puts line }

  puts "\nArguments:"
  # One argument per line, so each is wrapped as its own paragraph
  arguments = "-w## or -w ## Set wrap width. If not specified, uses console " \
              "width or 80 if console width cannot be determined.\n" \
              "--notimeout Disable input timeout. This could be necessary " \
              "for large files or files from external sources. Only needed " \
              "when piping in a word file, and not when one is specified in " \
              "the program's argument list."
  arguments.each_wrapped_line($consoleWidth) { |line| puts line }

  puts "\nExamples:\n" \
       "$#{$0} < mydoc.doc[x]\n" \
       "$#{$0} mydoc.doc[x] -w 60 --notimeout\n" \
       "$cat mydoc.doc[x] | #{$0} -w80\n"
end
#Defaults: seconds to wait for piped input, and the input file (nil = stdin)
stdinTimeout = 5
filename = nil
#The arguments are joined into one string so they can be matched with
# regular expressions. The separator must not occur inside any argument.
#See antiword.rb.txt
require 'digest'
arg_sep = "<"
#While the separator occurs in the arguments, draw a new one from a hash
# of a random number. tr maps the digits to letters because we don't want
# digits in the separator.
while ARGV.join("") =~ /#{arg_sep}/
  arg_sep = Digest::hexencode(Digest::SHA2.new().digest(rand().to_s)).tr("0-9", "G-P")
end
argstring = ARGV.join(arg_sep)

#Any -h / -help style argument prints the help text and stops
if argstring =~ /(?:#{arg_sep}|^)-+h(?:elp)?(?=#{arg_sep}|$)/
  usage
  exit(1)
end

temp_fname = nil
#Maps a regular expression source (one per known argument) to a handler
# that receives the MatchData, or nil when the argument is absent.
argtokens = {
  "--notimeout" => lambda { |found| stdinTimeout = 0 if found },
  #Width value; can be "-w10" or "-w 10" on the command line
  "-w(?:#{arg_sep})?(\\d+)" => lambda { |found| $wrapWidth = found[1].to_i if found },
  "(.+\\.docx?)" => lambda { |found| temp_fname = found[1] if found }
}

#Each expression is matched between separators or the start/end anchors.
# The leading separator is part of the match, the trailing one is not.
argtokens.each do |expression, handler|
  found = argstring.match(/.*(?:#{arg_sep}|^)(?>#{expression})(?=#{arg_sep}|$)/)
  handler.call(found)
end

#A file name given on the command line must refer to an existing file
if temp_fname
  unless File.exist?(temp_fname)
    puts "#{temp_fname} does not exist!"
    usage
    exit(1)
  end
  filename = temp_fname
end

#Strip every recognised argument from the argument string
argtokens.each_key do |expression|
  argstring.gsub!(/(?:#{arg_sep}|^)#{expression}(?=#{arg_sep}|$)/, "")
end

#Whatever is left was not recognised. Without a file name, assume the
# leftover was meant to be the file and report it together with the usage.
# With a file name, leftover garbage is simply ignored. Garbage arguments
# remain a problem when the word file is piped in.
if temp_fname.nil? && !argstring.empty?
  argstring.gsub!(/#{arg_sep}/, "")
  puts "#{argstring} is not a valid word file."
  usage
  exit(1)
end
#process_xml stays true while the input is handled as docx XML; it is
# cleared further down when the zip extraction fails.
process_xml = true;
#No file name on the command line: copy the contents of stdin to
# antiword_temp.zip and work on that file.
if filename == nil
  begin
    require 'timeout'
    #A timeout of 0 (--notimeout) lets the read block indefinitely
    Timeout::timeout(stdinTimeout) do
      #Word files are binary data; keep text-mode newline or encoding
      # conversion away from both the read and the write.
      $stdin.binmode
      File.open("antiword_temp.zip", "wb") { |file| file.write($stdin.read) }
      filename = "antiword_temp.zip"
    end
  rescue Timeout::Error
    #Remove the partial copy, if it was created at all
    File.delete("antiword_temp.zip") if File.exist?("antiword_temp.zip")
    "Timed out. This can happen if you piped in a very large file, or if you did not specify a file at all. To remedy this with very large files, add the --notimeout argument when calling #{$0}.
".each_wrapped_line { |line| puts line }
    usage
    Process.exit(1)
  end
end
#document receives either the raw word/document.xml or antiword's text output
document = String.new
gotContents = true
#Set to "if false" if RubyZip is causing problems
# and it will use the system's unzip instead.
if true
  require 'rubygems'
  require 'zip/zipfilesystem'
  begin
    Zip::ZipFile.open(filename) { | awContents |
      document = awContents.read("word/document.xml")
    }
  #Zip::ZipError: the file is not a readable zip archive.
  #Errno::ENOENT: the archive has no word/document.xml entry
  # (presumably how RubyZip reports a missing entry - confirm against the
  # installed version). Either way the input is not a docx.
  rescue Zip::ZipError, Errno::ENOENT
    gotContents = false
  end
else
  #unzip options: pipe output to stdout, only extract word/document.xml
  #result.read captures stdout from the opened process.
  #NOTE(review): this branch always reads antiword_temp.zip, which only
  # exists at this point when the document was piped in; a file named on
  # the command line is not looked at here.
  IO.popen("unzip -p antiword_temp.zip word/document.xml 2> /dev/null") { |result| document = result.read }
  gotContents = ($? == 0)
end
#If the unzip failed
unless gotContents
  #The document is not docx XML, so the tag processing must be skipped
  process_xml = nil
  #If the filename isn't antiword_temp, and this is a doc, copy
  #the file to antiword_temp. Do this to avoid having to escape the filename
  unless filename == "antiword_temp.zip"
    #Binary modes on both ends: the copy must be byte-for-byte
    File.open("antiword_temp.zip", "wb") { |awfile|
      File.open(filename, "rb") { | inFile | awfile.write(inFile.read) }
      filename = "antiword_temp.zip"
      # ^^^ antiword_temp.zip doesn't get deleted unless filename is set to this
    }
  end
  #Try to process with system's antiword, maybe it's an old doc file.
  #Set antiword's options: one paragraph per line, text mode, no images.
  # This matches the format of document.xml with the tags processed
  IO.popen('antiword antiword_temp.zip -w 0 -t -i 1 2> /dev/null') {
    |result| document = result.read
  }
  #If antiword failed as well there is nothing left to try:
  # report it, clean up the temporary copy and stop.
  unless $? == 0
    $stderr.write("Unsupported format\n")
    usage
    File.delete("antiword_temp.zip")
    Process.exit 1
  end
end
#Turn the WordprocessingML in document into plain text with light,
# markdown-like formatting. The replacements are [pattern, replacement]
# pairs applied in order with gsub!, so their order matters: formatting
# markers are inserted while the tags still exist, then tags are stripped,
# and XML entities are decoded last.
if process_xml
  replacements = []
  #Remove line breaks. There are none in MS-Words's XML, but
  #Could change in the future. Or could have been generated
  #using something else
  replacements << [ /\n|\r/, '']
  #Add seperators where column tags are using pipe, unless last in row
  replacements << [ /<\/w:p><\/w:tc>(?!<\/w:tr>)/, " | " ]
  #list elements, may add more soon
  replacements << [ /<w:numPr>/, "-" ]
  #Tabbed Columns
  replacements << [ /<w:tab[^\/]*\/>/, " " ]
  #insert [pic] to replace graphics.
  replacements << [ /<pic:pic[^>]*>/, '[pic]']
  replacements << [ /<wp:posOffset>\d+?<\/wp:posOffset>/, "" ]
  #Adding elements in markdown format for formatting
  #Getting info from http://www.jackreichert.com/2012/11/09/how-to-convert-docx-to-html/
  #key is that each group of formatted words is ended with </w:r>
  #The whole match (\0) is kept inside the markers because the tags get
  # stripped further down anyway.
  #italics
  replacements << [ /<w:i\/><.+?><w:t>.+?<\/w:r>/, "*\\0*" ]
  #bold
  replacements << [ /<w:b\/><.+?><w:t>.+?<\/w:r>/, "**\\0**" ]
  #Underscore (not really underline, but it works in plain text and it
  # should probably be emphasis anyway.)
  replacements << [ /<w:u\/><.+?><w:t>.+?<\/w:r>/, "_\\0_" ]
  #Heading1..Heading6 are prefixed with one to six '#' characters.
  # Each level needs its own style name, otherwise one heading would
  # collect the prefixes of several levels.
  (1..6).each { | level |
    replacements << [ /<w:pStyle w:val="Heading#{level}"\/>.+?<w:p>/, "#{'#' * level} \\0" ]
  }
  #Substitute paragraph tags with newline
  #Effectively, this should treat each paragraph on one line
  replacements << [ /<w:p>/, "\n" ]
  replacements << [ /<\/w:p>/, "\n" ]
  #Remove all other tags
  replacements << [ /<[^>]*>/, "" ]
  #Decode the predefined XML entities. This has to happen after the tags
  # are stripped so decoded '<' and '>' are not mistaken for markup, and
  # &amp; has to come last so "&amp;lt;" ends up as "&lt;" and not "<".
  replacements << [ /&lt;/, '<' ] <<
                  [ /&gt;/, '>' ] <<
                  [ /&quot;/, '"' ] <<
                  [ /&apos;/, "'" ] <<
                  [ /&amp;/, "&" ]
  replacements.each { | pattern, replacement |
    document.gsub!(pattern, replacement)
  }
  #Some UTF-8 characters don't print
  #This translates from utf-8 to ascii
  begin
    require "iconv"
    document = Iconv.conv("ascii//translit", "UTF-8", document)
  rescue LoadError
    #iconv is not available in this Ruby. Fall back to String#encode,
    # which cannot transliterate: characters without an ASCII equivalent
    # are replaced with "?".
    document = document.encode("US-ASCII", "UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
  end
end
#Print the wrapped document, one wrapped line per output line.
temp_archive = "antiword_temp.zip"
begin
  document.each_wrapped_line { |wrapped| $stdout.write("#{wrapped}\n") }
rescue Errno::EPIPE
  #Raised when the reading end of the pipe has been closed; ignored.
end
#The temporary copy is only ours to remove when it was the file processed
File.delete(temp_archive) if filename == temp_archive