#!/usr/bin/env ruby

=begin

= Info
    This is a PDF document filtering engine using Origami.
    Security policies are based on a white list of PDF features.
    Default policy details can be found in the default configuration file.
    You can also add your own policy and activate it using the -p switch.

= License
    Copyright (C) 2016  Guillaume Delugré.

    Origami is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Origami is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with Origami.  If not, see <http://www.gnu.org/licenses/>.

=end

begin
    require 'origami'
rescue LoadError
    $: << File.join(__dir__, '../lib')
    require 'origami'
end

require 'optparse'
require 'yaml'
require 'rexml/document'
require 'digest/sha2'
require 'fileutils'
require 'colorize'

# Policy file located next to this script; used when no -c switch is given.
DEFAULT_CONFIG_FILE = "#{File.dirname(__FILE__)}/config/pdfcop.conf.yml".freeze
# Name of the policy applied when no -p switch is given.
DEFAULT_POLICY = "standard".freeze
# Policy table, filled in at startup by load_config_file (hence left mutable).
SECURITY_POLICIES = {}

#
# Reads the YAML policy file at +path+ and merges every top-level entry it
# declares into SECURITY_POLICIES. Returns SECURITY_POLICIES.
#
def load_config_file(path)
    # Hash#update copies entries only, so the +false+ default of this intermediate
    # hash is not carried over to SECURITY_POLICIES or to the per-policy hashes.
    declared = Hash.new(false)
    declared.update(YAML.load(File.read(path)))

    SECURITY_POLICIES.update(declared)
end

#
# Command-line front end: declares the switches understood by the tool.
#
class OptParser
    BANNER = <<USAGE
Usage: #{$0} [options] <PDF-file>
The PDF filtering engine. Scans PDF documents for malicious structures.
Bug reports or feature requests at: http://github.com/gdelugre/origami

Options:
USAGE

    #
    # Parses the switches found in +args+ (they are consumed from +args+, the
    # remaining arguments are left in place) and returns the collected settings
    # as a Hash keyed by symbol. -h prints the usage text and exits.
    #
    def self.parse(args)
        settings = {colors: true, password: ''}

        parser = OptionParser.new
        parser.banner = BANNER

        parser.on("-o", "--output LOG_FILE", "Output log file (default STDOUT)") do |log_file|
            settings[:output_log] = log_file
        end

        parser.on("-c", "--config CONFIG_FILE", "Load security policies from given configuration file") do |config_file|
            settings[:config_file] = config_file
        end

        parser.on("-p", "--policy POLICY_NAME", "Specify applied policy. Predefined policies: 'none', 'standard', 'strong', 'paranoid'") do |policy_name|
            settings[:policy] = policy_name
        end

        parser.on("-m", "--move PATH", "Move rejected documents to the specified directory.") do |directory|
            settings[:move_dir] = directory
        end

        parser.on("-P", "--password PASSWORD", "Password to use if the document is encrypted") do |secret|
            settings[:password] = secret
        end

        parser.on("-n", "--no-color", "Turn off colorized output") do
            settings[:disable_colors] = true
        end

        parser.on_tail("-h", "--help", "Show this message") do
            puts parser
            exit
        end

        parser.parse!(args)

        settings
    end
end

# Parse the command line, then set up logging, the active policy and the target file.
@options = OptParser.parse(ARGV)

# Log to the file requested with -o (opened with mode "a+") or to standard output.
LOGGER = @options.key?(:output_log) ? File.open(@options[:output_log], "a+") : STDOUT

@options[:policy] = DEFAULT_POLICY unless @options.key?(:policy)

# A quarantine directory, when requested, must already exist.
if @options.key?(:move_dir) && !File.directory?(@options[:move_dir])
    abort "Error: #{@options[:move_dir]} is not a valid directory."
end

String.disable_colorization @options[:disable_colors]

# Policies are looked up under the key "POLICY_<NAME>", with the name upper-cased.
load_config_file(@options[:config_file] || DEFAULT_CONFIG_FILE)
abort "Undeclared policy `#{@options[:policy]}'" unless SECURITY_POLICIES.key?("POLICY_#{@options[:policy].upcase}")

# Whatever is left on the command line after option parsing starts with the file to scan.
abort "Error: No filename was specified. #{$0} --help for details." if ARGV.empty?
TARGET = ARGV.shift

#
# Writes one timestamped line to LOGGER. The timestamp is rendered in cyan and
# +str+ in +color+ (the colorize default color when omitted).
#
def log(str, color = :default)
    timestamp = "[#{Time.now}]".cyan
    message = str.colorize(color)

    LOGGER.puts(timestamp + " #{message}")
end

#
# Logs the rejection of the target document together with its +cause+, moves the
# document to the quarantine directory when one was requested with -m, and then
# terminates the program through abort. Never returns.
#
def reject(cause)
    verdict = "Document rejected by policy `#{@options[:policy]}', caused by #{cause.inspect}."
    log(verdict, :red)

    quarantine(TARGET, @options[:move_dir]) if @options.key?(:move_dir)

    abort
end

#
# Moves +file+ into +quarantine_folder+. The destination name is the original
# base name followed by an underscore and the SHA-256 hex digest of the file
# content, with the original extension kept.
#
# Both arguments are honoured (the previous body ignored them and always used
# the global target and the -m directory instead).
#
def quarantine(file, quarantine_folder)
    digest = Digest::SHA256.file(file)
    ext = File.extname(file)
    dest_name = "#{File.basename(file, ext)}_#{digest}#{ext}"
    dest_path = File.join(quarantine_folder, dest_name)

    FileUtils.move(file, dest_path)
end

#
# Rejects the document when the active policy explicitly sets any of
# +required_rights+ to false. The rejection cause is the full list of rights.
#
# A right that the policy does not declare is not treated as denied: only an
# explicit +false+ value triggers the rejection.
#
def check_rights(*required_rights)
    policy_key = "POLICY_#{@options[:policy].upcase}"
    granted = SECURITY_POLICIES[policy_key]

    denied = required_rights.any? { |right| granted[right.to_s] == false }
    reject(required_rights) if denied
end

#
# Collects the XML carried by the XFA entry +xfa+ and checks every <script>
# element it contains against the active policy: FormCalc scripts need
# allowFormCalc, any other script needs allowJS.
#
# +xfa+ is either a single stream or an array of packets; anything else is
# rejected as malformed.
#
def analyze_xfa_forms(xfa)
    case xfa
    when Origami::Array
        xml = ""
        # Only the entries at odd positions are read — presumably the array
        # alternates packet names and packet streams (TODO confirm).
        xfa.each_with_index do |packet, position|
            next unless position.odd?

            xml << packet.solve.data
        end
    when Origami::Stream
        xml = xfa.data
    else
        reject("Malformed XFA dictionary")
    end

    document = REXML::Document.new(xml)
    scripts = REXML::XPath.match(document, "//script")

    scripts.each do |script|
        content_type = script.attributes["contentType"]

        if content_type == "application/x-formcalc"
            check_rights(:allowFormCalc)
        else
            check_rights(:allowJS)
        end
    end
end

#
# Checks the page annotation +annot+ against the active policy.
#
# Every annotation requires allowAnnotations. When +annot+ is a dictionary with
# a Subtype entry, the subtype selects the additional rights to check.
# +_level+ is accepted for symmetry with the other analyzers and is not used.
#
def analyze_annotation(annot, _level = 0)
    check_rights(:allowAnnotations)

    return unless annot.is_a?(Origami::Dictionary) && annot.key?(:Subtype)

    case annot[:Subtype].solve.value
    when :FileAttachment
        check_rights(:allowAttachments, :allowFileAttachmentAnnotation)

    when :Sound
        check_rights(:allowSoundAnnotation)

    when :Movie
        check_rights(:allowMovieAnnotation)

    when :Screen
        check_rights(:allowScreenAnnotation)

    when :Widget
        # Must be spelled exactly like the AcroForm right checked elsewhere in
        # this file; a differently-cased name is never found in the policy and
        # would therefore never reject anything.
        check_rights(:allowAcroForms)

    when :"3D"
        check_rights(:allow3DAnnotation)

        # 3D annotation might pull in JavaScript for real-time driven behavior.
        if annot.key?(:"3DD")
            dd = annot[:"3DD"].solve
            u3dstream = nil

            # The 3DD entry is either the 3D stream itself or a dictionary that
            # carries the stream under its own 3DD key.
            case dd
            when Origami::Stream
                u3dstream = dd
            when Origami::Dictionary
                u3dstream = dd[:"3DD"]
            end

            if u3dstream && u3dstream.key?(:OnInstantiate)
                check_rights(:allowJS)

                if annot.key?(:"3DA") # is 3d view instantiated automatically?
                    u3dactiv = annot[:"3DA"].solve

                    activated_at_opening = u3dactiv.is_a?(Origami::Dictionary) &&
                                           (u3dactiv[:A] == :PO || u3dactiv[:A] == :PV)
                    check_rights(:allowJSAtOpening) if activated_at_opening
                end
            end
        end

    when :RichMedia
        check_rights(:allowRichMediaAnnotation)
    end
end

#
# Inspects +page+: its additional-actions dictionary (open and close actions)
# and every annotation it holds. +level+ only drives the indentation of the
# log output and is passed, incremented, to the nested analyzers.
#
def analyze_page(page, level = 0)
    heading = " " * 2 * level + ">" * (level + 1)
    log(heading + " Inspecting page...")

    detail = " " * 2 * (level + 1) + "." * (level + 1)
    return unless page.is_a?(Origami::Dictionary)

    #
    # Checking page additional actions.
    #
    if page.key?(:AA) && page.AA.is_a?(Origami::Dictionary)
        log(detail + " Page has an action dictionary.")

        actions = Origami::Page::AdditionalActions.new(page.AA)
        actions.parent = page.AA.parent

        # The O action is analyzed as triggered at opening, the C action is not.
        analyze_action(actions.O, true, level + 1) if actions.key?(:O)
        analyze_action(actions.C, false, level + 1) if actions.key?(:C)
    end

    #
    # Looking for page annotations.
    #
    page.each_annotation { |annot| analyze_annotation(annot, level + 1) }
end

# Maximum nesting of action/page inspection. Chained actions and GoTo
# destinations recurse; a document whose actions form a cycle would otherwise
# recurse until the interpreter stack overflows.
MAX_ACTION_DEPTH = 100

#
# Checks +action+ against the active policy.
#
# +action+ is either an action dictionary or a destination array.
# +triggered_at_opening+ tells whether the action runs when the document (or
# page) is opened; it adds the allowJSAtOpening check to JavaScript actions and
# is inherited by chained actions. +level+ drives the log indentation and
# bounds the recursion depth.
#
def analyze_action(action, triggered_at_opening, level = 0)
    reject("Action nesting too deep") if level > MAX_ACTION_DEPTH

    section_prefix = " " * 2 * level + ">" * (level + 1)
    log(section_prefix + " Inspecting action...")

    text_prefix = " " * 2 * (level + 1) + "." * (level + 1)
    if action.is_a?(Origami::Dictionary)
        log(text_prefix + " Found #{action[:S]} action.")
        type = action[:S].is_a?(Origami::Reference) ? action[:S].solve : action[:S]

        case type.value
        when :JavaScript
            check_rights(:allowJS)
            check_rights(:allowJSAtOpening) if triggered_at_opening

        when :Launch
            check_rights(:allowLaunchAction)

        when :Named
            check_rights(:allowNamedAction)

        when :GoTo
            check_rights(:allowGoToAction)
            dest = action[:D].is_a?(Origami::Reference) ? action[:D].solve : action[:D]
            # Only explicit destinations (an array starting with a page reference)
            # are followed; the destination page is then inspected as well.
            if dest.is_a?(Origami::Array) && dest.length > 0 && dest.first.is_a?(Origami::Reference)
                dest_page = dest.first.solve
                if dest_page.is_a?(Origami::Page)
                    log(text_prefix + " Destination page found.")
                    analyze_page(dest_page, level + 1)
                end
            end

        when :GoToE
            check_rights(:allowAttachments, :allowGoToEAction)

        when :GoToR
            check_rights(:allowGoToRAction)

        when :Thread
            # Presumably an F entry designates a thread in another file, hence
            # the remote go-to right (TODO confirm).
            check_rights(:allowGoToRAction) if action.key?(:F)

        when :URI
            check_rights(:allowURIAction)

        when :SubmitForm
            check_rights(:allowAcroForms, :allowSubmitFormAction)

        when :ImportData
            check_rights(:allowAcroForms, :allowImportDataAction)

        when :Rendition
            check_rights(:allowScreenAnnotation, :allowRenditionAction)

        when :Sound
            check_rights(:allowSoundAnnotation, :allowSoundAction)

        when :Movie
            check_rights(:allowMovieAnnotation, :allowMovieAction)

        when :RichMediaExecute
            check_rights(:allowRichMediaAnnotation, :allowRichMediaAction)

        when :GoTo3DView
            check_rights(:allow3DAnnotation, :allowGoTo3DAction)
        end

        if action.key?(:Next)
            log(text_prefix + "This action is chained to another action!")
            check_rights(:allowChainedActions)

            # Next holds either a single action or an array of actions. Each one
            # is analyzed as an action (not as a destination array) and inherits
            # the opening trigger of its parent.
            successors = action.Next
            successors = [ successors ] unless successors.is_a?(Origami::Array)
            successors.each do |successor|
                successor = successor.solve if successor.is_a?(Origami::Reference)
                analyze_action(successor, triggered_at_opening, level + 1)
            end
        end

    elsif action.is_a?(Origami::Array)
        # A bare array is a destination: inspect the page it points to.
        dest = action
        if dest.length > 0 && dest.first.is_a?(Origami::Reference)
            dest_page = dest.first.solve
            if dest_page.is_a?(Origami::Page)
                log(text_prefix + " Destination page found.")
                check_rights(:allowGoToAction)
                analyze_page(dest_page, level + 1)
            end
        end
    end
end

#
# Main analysis: parse the target document and walk its structure, checking
# every feature found against the active policy. The first forbidden feature
# ends the run through reject; reaching the end means the document is accepted.
# Any error raised during the analysis also leads to a rejection.
#
begin
    log("PDFcop is running on target `#{TARGET}', policy = `#{@options[:policy]}'", :green)
    log("  File size: #{File.size(TARGET)} bytes", :magenta)
    log("  SHA256: #{Digest::SHA256.file(TARGET)}", :magenta)

    # Parser errors are tolerated and decryption is attempted only when the
    # active policy allows them.
    @pdf = Origami::PDF.read(TARGET,
        verbosity: Origami::Parser::VERBOSE_QUIET,
        ignore_errors: SECURITY_POLICIES["POLICY_#{@options[:policy].upcase}"]['allowParserErrors'],
        decrypt: SECURITY_POLICIES["POLICY_#{@options[:policy].upcase}"]['allowEncryption'],
        prompt_password: lambda { '' },
        password: @options[:password]
    )

    log("> Inspecting document structure...", :yellow)
    if @pdf.encrypted?
        log("  . Encryption = YES")
        check_rights(:allowEncryption)
    end

    log("> Inspecting document catalog...", :yellow)
    catalog = @pdf.Catalog
    reject("Invalid document catalog") unless catalog.is_a?(Origami::Catalog)

    if catalog.key?(:OpenAction)
        log("  . OpenAction entry = YES")
        check_rights(:allowOpenAction)
        action = catalog.OpenAction
        analyze_action(action, true, 1)
    end

    if catalog.key?(:AA)
        if catalog.AA.is_a?(Origami::Dictionary)
            aa = Origami::CatalogAdditionalActions.new(catalog.AA)
            aa.parent = catalog
            log("  . Additional actions dictionary = YES")
            # None of the catalog-level triggers is analyzed as running at opening.
            analyze_action(aa.WC, false, 1) if aa.key?(:WC)
            analyze_action(aa.WS, false, 1) if aa.key?(:WS)
            analyze_action(aa.DS, false, 1) if aa.key?(:DS)
            analyze_action(aa.WP, false, 1) if aa.key?(:WP)
            analyze_action(aa.DP, false, 1) if aa.key?(:DP)
        end
    end

    if catalog.key?(:AcroForm)
        acroform = catalog.AcroForm
        if acroform.is_a?(Origami::Dictionary)
            log("  . AcroForm = YES")
            check_rights(:allowAcroForms)
            if acroform.key?(:XFA)
                log("  . XFA = YES")
                check_rights(:allowXFAForms)

                analyze_xfa_forms(acroform[:XFA].solve)
            end
        end
    end

    log("> Inspecting JavaScript names directory...", :yellow)
    if @pdf.each_named_script.any?
        check_rights(:allowJS)
        check_rights(:allowJSAtOpening)
    end

    log("> Inspecting attachment names directory...", :yellow)
    if @pdf.each_attachment.any?
        check_rights(:allowAttachments)
    end

    log("> Inspecting document pages...", :yellow)
    @pdf.each_page do |page|
        analyze_page(page, 1)
    end

    log("> Inspecting document streams...", :yellow)
    @pdf.indirect_objects.find_all{|obj| obj.is_a?(Origami::Stream)}.each do |stream|
        if stream.dictionary.key?(:Filter)
            filters = stream.Filter
            # A lone filter is stored as a bare name: wrap it so that both forms
            # go through the same checks.
            filters = [ filters ] if filters.is_a?(Origami::Name)

            # Test against the core Array class: the wrapper built above is a
            # plain Ruby array, which an Origami::Array test would skip, leaving
            # single-filter streams unchecked.
            if filters.is_a?(::Array)
                filters.each do |filter|
                    case filter.value
                    when :ASCIIHexDecode
                        check_rights(:allowASCIIHexFilter)
                    when :ASCII85Decode
                        check_rights(:allowASCII85Filter)
                    when :LZWDecode
                        check_rights(:allowLZWFilter)
                    when :FlateDecode
                        check_rights(:allowFlateDecode)
                    when :RunLengthDecode
                        check_rights(:allowRunLengthFilter)
                    when :CCITTFaxDecode
                        check_rights(:allowCCITTFaxFilter)
                    when :JBIG2Decode
                        check_rights(:allowJBIG2Filter)
                    when :DCTDecode
                        check_rights(:allowDCTFilter)
                    when :JPXDecode
                        check_rights(:allowJPXFilter)
                    when :Crypt
                        check_rights(:allowCryptFilter)
                    end
                end
            end
        end
    end

    #
    # TODO: Detect JS at opening in XFA (check event tag)
    #       Check image encoding in XFA ?
    #       Only allow valid signed documents ?
    #       Recursively scan attached files.
    #       On-the-fly injection of prerun JS code to hook vulnerable methods (dynamic exploit detection) ???
    #       ...
    #

    log("Document accepted by policy `#{@options[:policy]}'.", :green)

rescue
    # Any analysis error is treated as a rejection rather than an acceptance.
    log("An error occured during analysis : #{$!.class} (#{$!.message})")
    reject("Analysis failure")
ensure
    LOGGER.close
end
