debian/htdig/htdig-3.2.0b6/contrib/doc2html/pdf2html.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

#!/usr/bin/perl -w
use strict;
#
# Version 1.0.1	12-Feb-2002
# Written by David Adams <d.j.adams@soton.ac.uk>
#
# Uses pdftotext & pdfinfo utilities from the xpdf package
# to read an Adobe Acrobat file and produce HTML output.
#  
# Can be called directly from htdig as an external converter,
#  or may be called by doc2html.pl converter script. 
#

####--- Configuration ---####
# Full paths of pdftotext and pdfinfo
# (get them from the xpdf package at http://www.foolabs.com/xpdf/):

#### YOU MUST SET THESE  ####

my $PDFTOTEXT = "/... .../pdftotext";
my $PDFINFO = "/... .../pdfinfo";
#
# De-hyphenation switch: a true value joins words that were split by a
# hyphen at the end of a line; a false value leaves the text alone.
my $Dehyphenate = 1;
#
# Title to fall back on when neither pdfinfo nor the file name gives one:
my $Default_title = "Adobe Acrobat Document";
#
# Null device used to discard pdfinfo's error output (Win32 or Unix):
my $null = ($^O eq "MSWin32") ? "nul" : "/dev/null";
####--- End of configuration ---###

die "Unable to execute pdftotext" unless -x $PDFTOTEXT;

# First argument: the file to convert (required).
my $Input = $ARGV[0]
  or die "Usage: pdf2html.pl filename [mime-type] [URL]";

# Second argument: optional MIME type, which must be a PDF type if given.
my $MIME_type = $ARGV[1] || '';
die "MIME/type $MIME_type wrong"
  if $MIME_type and ($MIME_type !~ m#^application/pdf#i);

# Third argument: optional name or URL.  Only the part after the last
# "/" is kept; it is used by pdf_head as a fallback title.
my $Name = $ARGV[2] || '';
my $had_path = ($Name =~ s#^(.*/)##);
if ($had_path) {
  # A leading path was stripped, so the argument was a URL:
  # decode its %XX escapes.
  $Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
}

pdf_head();
pdf_body();
exit;

#------------------------------------------------------------------------------

sub pdf_head {
#
#  Contributed by Greg Holmes and Michael Fuller
#   (any errors by David Adams)
#
#  Prints the opening <HTML> tag and the whole <HEAD> section.
#  Title, subject and keywords are read from the output of pdfinfo run
#  on $Input.  When no title is found the file name in $Name is used
#  (in square brackets), or failing that $Default_title.  If the pdfinfo
#  pipe cannot be opened a warning is issued and the header is still
#  printed with the fallback title.
#
    my $title = '';
    my $subject = '';
    my $keywords = '';

    # The file name is handed to the shell inside single quotes, so an
    # embedded single quote has to be written as '\'' or it would end
    # the quoting early (breaking the command, or letting the rest of
    # the name run as shell code).  The Win32 command line is left
    # exactly as it was.
    my $quoted = $Input;
    $quoted =~ s/'/'\\''/g unless $^O eq "MSWin32";

    if (open(my $info, "$PDFINFO '$quoted' 2>$null |")) {
        while (my $line = <$info>) {
            if ($line =~ m/^title:/i) {
                $line =~ s/^title:\s+//i;
                $title = clean_pdf($line);
            } elsif ($line =~ m/^subject:/i) {
                $line =~ s/^subject:\s+//i;
                $subject = clean_pdf($line);
            } elsif ($line =~ m/^keywords:/i) {
                $line =~ s/^keywords:\s+//i;
                $keywords = clean_pdf($line);
            }
        }
        close $info;
    } else { warn "cannot execute pdfinfo" }

    if (not length $title) {
      if ($Name) {
        $title = '[' . $Name . ']';
      } else {
        $title = $Default_title;
      }
    }

    # Escape the characters that are special in HTML so that metadata
    # such as "A <B> & C" cannot corrupt the generated markup.  Double
    # quotes in the pdfinfo values were already replaced by clean_pdf,
    # which keeps the CONTENT="..." attributes intact.
    for my $field ($title, $subject, $keywords) {
        $field =~ s/&/&amp;/g;
        $field =~ s/</&lt;/g;
        $field =~ s/>/&gt;/g;
    }

    print "<HTML>\n<HEAD>\n";
    print "<TITLE>$title</TITLE>\n";
    if (length $subject) {
      print '<META NAME="DESCRIPTION" CONTENT="' . $subject. "\">\n";
    }
    if (length $keywords) {
      print '<META NAME="KEYWORDS" CONTENT="' . $keywords . "\">\n";
    }
    print "</HEAD>\n";

###print STDERR "\n$Name:\n";
###print STDERR "\tTitle:\t$title\n";
###print STDERR "\tDescription:\t$subject\n";
###print STDERR "\tKeywords:\t$keywords\n";

}

#------------------------------------------------------------------------------

sub pdf_body {
#
#  Prints the <BODY> section: the text that "pdftotext -raw" extracts
#  from $Input, one output line per non-empty line of text.  Successive
#  text lines are separated by <br>; one or more empty lines produce a
#  <p> before the next text line.  Finishes by closing </BODY> and
#  </HTML>.  Dies if the pdftotext pipe cannot be opened.
#

  my $bline = '';	# separator printed in front of the next text line

  # The file name is handed to the shell inside single quotes, so an
  # embedded single quote has to be written as '\'' or it would end the
  # quoting early.  The Win32 command line is left exactly as it was.
  my $quoted = $Input;
  $quoted =~ s/'/'\\''/g unless $^O eq "MSWin32";

  open(my $cat, "$PDFTOTEXT -raw '$quoted' - |") ||
	  die "$PDFTOTEXT doesn't want to be opened using pipe\n";
  print "<BODY>\n";
  while (defined(my $line = <$cat>)) {
    if ($Dehyphenate) {
      # A line ending in "letter, hyphen" is joined to the line(s) that
      # follow.  The next line is appended and the join attempted BEFORE
      # looking for end of input, so a word split across the last two
      # lines is also repaired; a hyphen on the very last line simply
      # has nothing to be joined to.
      while ($line =~ m/[A-Za-z\300-\377]-\s*$/) {
	  my $next = <$cat>;
	  last unless defined $next;
	  $line .= $next;
	  $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
      }
    }
    # replace character 0255 (octal) with a hyphen -- presumably the
    # Latin-1 soft hyphen in pdftotext's output:
    $line =~ s/\255/-/g;
    # replace bell, backspace, tab. etc. with single space:
    $line =~ s/[\000-\040]+/ /g;
    $line = HTML($line);
    if (length $line) {
      print $bline, $line, "\n";
      $bline = "<br>\n";
    } else {
      # empty line: start a new paragraph before the next text
      $bline = "<p>\n";
    }
  }
  close $cat;

  print "</BODY>\n</HTML>\n";
  return;
}

#------------------------------------------------------------------------------

sub HTML {
# Returns a copy of the given text made safe for inclusion in HTML:
# form feeds and runs of white space become a single space, trailing
# white space is dropped, and the characters & < > are replaced by
# their entities.

  my ($raw) = @_;
  my %entity = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');

  (my $out = $raw) =~ tr/\f/\n/;	# form feed -> newline
  $out =~ s/\s+/ /g;			# squeeze white space runs
  $out =~ s/\s+$//gm;			# drop trailing white space
  $out =~ s/([&<>])/$entity{$1}/g;	# escape HTML special characters
  chomp $out;

  return $out;
}

#------------------------------------------------------------------------------

sub clean_pdf {
# Tidies one value taken from pdfinfo output: strips the line ending,
# deletes every occurrence of the character pair 0376 0377 (octal),
# and turns double quotes into single quotes.

  my ($value) = @_;

  chomp $value;
  $value =~ s/\376\377//g;
  $value =~ tr/"/'/;

  return $value;
}