quilt files
Ivan Kohler
11 years ago
0 | patches |
0 | series |
0 | 2 |
0 | #!/usr/bin/perl -w | |
1 | ||
2 | package main; | |
3 | ||
4 | use warnings; | |
5 | use strict; | |
6 | use CAM::PDF; | |
7 | use Getopt::Long; | |
8 | use Pod::Usage; | |
9 | use English qw(-no_match_vars); | |
10 | ||
11 | our $VERSION = '1.58'; | |
12 | ||
13 | my %opts = ( | |
14 | # Hardcoded: | |
15 | template => 'crunchjpg_tmpl.pdf', | |
16 | ||
17 | # User settable values: | |
18 | justjpgs => 0, | |
19 | quality => 50, | |
20 | scale => undef, | |
21 | scalemin => 0, | |
22 | skip => {}, | |
23 | only => {}, | |
24 | Verbose => 0, | |
25 | verbose => 0, | |
26 | order => 0, | |
27 | help => 0, | |
28 | version => 0, | |
29 | ||
30 | # Temporary values: | |
31 | onlyval => [], | |
32 | skipval => [], | |
33 | qualityval => undef, | |
34 | scaleminval=> undef, | |
35 | scaleval => undef, | |
36 | scales => {1 => undef, 2 => '50%', 4 => '25%', 8 => '12.5%'}, | |
37 | ); | |
38 | ||
39 | Getopt::Long::Configure('bundling'); | |
40 | GetOptions('S|skip=s' => \@{$opts{skipval}}, | |
41 | 'O|only=s' => \@{$opts{onlyval}}, | |
42 | 'q|quality=i' => \$opts{qualityval}, | |
43 | 's|scale=i' => \$opts{scaleval}, | |
44 | 'm|scalemin=i' => \$opts{scaleminval}, | |
45 | 'j|justjpgs' => \$opts{justjpgs}, | |
46 | 'veryverbose' => \$opts{Verbose}, | |
47 | 'v|verbose' => \$opts{verbose}, | |
48 | 'o|order' => \$opts{order}, | |
49 | 'h|help' => \$opts{help}, | |
50 | 'V|version' => \$opts{version}, | |
51 | ) or pod2usage(1); | |
52 | if ($opts{help}) | |
53 | { | |
54 | pod2usage(-exitstatus => 0, -verbose => 2); | |
55 | } | |
56 | if ($opts{version}) | |
57 | { | |
58 | print "CAM::PDF v$CAM::PDF::VERSION\n"; | |
59 | exit 0; | |
60 | } | |
61 | ||
62 | ## Fix up and validate special options: | |
63 | ||
64 | if ($opts{Verbose}) | |
65 | { | |
66 | $opts{verbose} = 1; | |
67 | } | |
68 | if (defined $opts{scaleval}) | |
69 | { | |
70 | if (exists $opts{scales}->{$opts{scaleval}}) | |
71 | { | |
72 | $opts{scale} = $opts{scales}->{$opts{scaleval}}; | |
73 | } | |
74 | else | |
75 | { | |
76 | die "Invalid value for --scale switch\n"; | |
77 | } | |
78 | } | |
79 | if (defined $opts{scaleminval}) | |
80 | { | |
81 | if ($opts{scaleminval} =~ m/\A\d+\z/xms && $opts{scaleminval} > 0) | |
82 | { | |
83 | $opts{scalemin} = $opts{scaleminval}; | |
84 | } | |
85 | else | |
86 | { | |
87 | die "Invalid value for --scalemin switch\n"; | |
88 | } | |
89 | } | |
90 | if (defined $opts{qualityval}) | |
91 | { | |
92 | if ($opts{qualityval} =~ m/\A\d+\z/xms && $opts{qualityval} >= 1 && $opts{qualityval} <= 100) | |
93 | { | |
94 | $opts{quality} = $opts{qualityval}; | |
95 | } | |
96 | else | |
97 | { | |
98 | die "The JPEG --quality setting must be between 1 and 100\n"; | |
99 | } | |
100 | } | |
101 | foreach my $flag (qw( skip only )) | |
102 | { | |
103 | foreach my $val (@{$opts{$flag.'val'}}) | |
104 | { | |
105 | foreach my $key (split /\D+/xms, $val) | |
106 | { | |
107 | $opts{$flag}->{$key} = 1; | |
108 | } | |
109 | } | |
110 | } | |
111 | if (!-f $opts{template}) | |
112 | { | |
113 | die "Cannot find the template pdf called $opts{template}\n"; | |
114 | } | |
115 | ||
116 | # Start work: | |
117 | ||
118 | if (@ARGV < 1) | |
119 | { | |
120 | pod2usage(1); | |
121 | } | |
122 | ||
123 | my $infile = shift; | |
124 | my $outfile = shift || q{-}; | |
125 | ||
126 | my $doc = CAM::PDF->new($infile) || die "$CAM::PDF::errstr\n"; | |
127 | ||
128 | if (!$doc->canModify()) | |
129 | { | |
130 | die "This PDF forbids modification\n"; | |
131 | } | |
132 | ||
133 | my $pages = $doc->numPages(); | |
134 | my $nimages = 0; | |
135 | my $rimages = 0; | |
136 | ||
137 | my %doneobjs; | |
138 | ||
139 | my $oldcontentsize = $doc->{contentlength}; | |
140 | my $oldtotsize = 0; | |
141 | my $newtotsize = 0; | |
142 | ||
143 | for my $p (1..$pages) | |
144 | { | |
145 | my $c = $doc->getPageContent($p); | |
146 | my @parts = split /(\/[\w]+\s*Do)\b/xms, $c; | |
147 | foreach my $part (@parts) | |
148 | { | |
149 | if ($part =~ m/\A(\/[\w]+)\s*Do\z/xms) | |
150 | { | |
151 | my $ref = $1; | |
152 | my $xobj = $doc->dereference($ref, $p); | |
153 | my $objnum = $xobj->{objnum}; | |
154 | my $im = $doc->getValue($xobj); | |
155 | my $l = $im->{Length} || $im->{L} || 0; | |
156 | if ($l) | |
157 | { | |
158 | $l = $doc->getValue($l); | |
159 | } | |
160 | my $w = $im->{Width} || $im->{W} || 0; | |
161 | if ($w) | |
162 | { | |
163 | $w = $doc->getValue($w); | |
164 | } | |
165 | my $h = $im->{Height} || $im->{H} || 0; | |
166 | if ($h) | |
167 | { | |
168 | $h = $doc->getValue($h); | |
169 | } | |
170 | ||
171 | next if (exists $doneobjs{$objnum}); | |
172 | ||
173 | $nimages++; | |
174 | _inform("Image $nimages page $p, $ref = object $objnum, (w,h)=($w,$h), length $l", $opts{verbose}); | |
175 | ||
176 | if (exists $opts{skip}->{$objnum} || | |
177 | (0 < scalar keys %{$opts{only}} && !exists $opts{only}->{$objnum})) | |
178 | { | |
179 | _inform("Skipping object $objnum", $opts{verbose}); | |
180 | next; | |
181 | } | |
182 | ||
183 | my $isjpg = _isjpg($im); | |
184 | ||
185 | if ((!$isjpg) && $opts{justjpgs}) | |
186 | { | |
187 | _inform('Not a jpeg', $opts{verbose}); | |
188 | } | |
189 | else | |
190 | { | |
191 | my $oldsize = $doc->getValue($im->{Length}); | |
192 | if (!$oldsize) | |
193 | { | |
194 | die "PDF error: Failed to get size of image\n"; | |
195 | } | |
196 | $oldtotsize += $oldsize; | |
197 | ||
198 | my $tmpl = CAM::PDF->new($opts{template}) || die "$CAM::PDF::errstr\n"; | |
199 | ||
200 | # Get a handle on the needed data bits from the template | |
201 | my $media_array = $tmpl->getValue($tmpl->getPage(1)->{MediaBox}); | |
202 | my $rawpage = $tmpl->getPageContent(1); | |
203 | ||
204 | $media_array->[2]->{value} = $w; | |
205 | $media_array->[3]->{value} = $h; | |
206 | my $page = $rawpage; | |
207 | $page =~ s/xxx/$w/igxms; | |
208 | $page =~ s/yyy/$h/igxms; | |
209 | $tmpl->setPageContent(1, $page); | |
210 | $tmpl->replaceObject(9, $doc, $objnum, 1); | |
211 | ||
212 | my $ofile = "/tmp/crunchjpg.$$"; | |
213 | $tmpl->cleanoutput($ofile); | |
214 | ||
215 | my $cmd = ('convert ' . | |
216 | ($opts{scale} && $w > $opts{scalemin} && $h > $opts{scalemin} ? | |
217 | "-scale '$opts{scale}' " : q{}) . | |
218 | "-quality $opts{quality} " . | |
219 | '-density 72x72 ' . | |
220 | "-page ${w}x$h " . | |
221 | "pdf:$ofile jpg:- | " . | |
222 | 'convert jpg:- pdf:- |'); | |
223 | ||
224 | _inform($cmd, $opts{Verbose}); | |
225 | ||
226 | # TODO: this should use IPC::Open3 or the like | |
227 | open my $pipe, $cmd ## no critic (ProhibitTwoArgOpen) | |
228 | or die "Failed to convert object $objnum to a jpg and back\n"; | |
229 | my $content = do { local $RS = undef; <$pipe>; }; | |
230 | close $pipe | |
231 | or die "Failed to convert object $objnum to a jpg and back\n"; | |
232 | ||
233 | my $jpg = CAM::PDF->new($content) || die "$CAM::PDF::errstr\n"; | |
234 | my $jpgim = $jpg->getObjValue(8); | |
235 | my $jpgsize = $jpg->getValue($jpgim->{Length}); | |
236 | ||
237 | if ($jpgsize < $oldsize) { | |
238 | $doc->replaceObject($objnum, $jpg, 8, 1); | |
239 | ||
240 | $newtotsize += $jpgsize; | |
241 | ||
242 | my $percent = sprintf '%.1f', 100 * ($oldsize - $jpgsize) / $oldsize; | |
243 | _inform("\tcompressed $oldsize -> $jpgsize ($percent%)", $opts{verbose}); | |
244 | $doneobjs{$objnum} = 1; | |
245 | $rimages++; | |
246 | } else { | |
247 | _inform("\tskipped $oldsize -> $jpgsize", $opts{verbose}); | |
248 | } | |
249 | ||
250 | } | |
251 | } | |
252 | } | |
253 | } | |
254 | ||
255 | _inform("Crunched $rimages of $nimages images", $opts{verbose}); | |
256 | $doc->cleanoutput($outfile); | |
257 | ||
258 | my $newcontentsize = $doc->{contentlength}; | |
259 | ||
260 | if ($opts{verbose}) | |
261 | { | |
262 | my $contentpercent = sprintf '%.1f', $oldcontentsize ? 100 * ($oldcontentsize - $newcontentsize) / $oldcontentsize : 0; | |
263 | my $totpercent = sprintf '%.1f', $oldtotsize ? 100 * ($oldtotsize - $newtotsize) / $oldtotsize : 0; | |
264 | _inform('Compression summary:', 1); | |
265 | _inform(" Document: $oldcontentsize -> $newcontentsize ($contentpercent%)", 1); | |
266 | _inform(" Images: $oldtotsize -> $newtotsize ($totpercent%)", 1); | |
267 | } | |
268 | ||
269 | sub _isjpg | |
270 | { | |
271 | my $im = shift; | |
272 | return if (!$im->{Filter}); | |
273 | ||
274 | my $f = $im->{Filter}; | |
275 | my @names = $f->{type} eq 'array' ? @{$f->{value}} : $f; | |
276 | for my $e (@names) | |
277 | { | |
278 | my $name = $doc->getValue($e); | |
279 | if (ref $name) | |
280 | { | |
281 | $name = $name->{value}; | |
282 | } | |
283 | #warn "Checking $name\n"; | |
284 | if ($name eq 'DCTDecode') | |
285 | { | |
286 | return 1; | |
287 | } | |
288 | } | |
289 | return; | |
290 | } | |
291 | ||
292 | sub _inform | |
293 | { | |
294 | my $str = shift; | |
295 | my $verbose = shift; | |
296 | ||
297 | if ($verbose) | |
298 | { | |
299 | print STDERR $str, "\n"; | |
300 | } | |
301 | return; | |
302 | } | |
303 | ||
304 | __END__ | |
305 | ||
306 | =for stopwords crunchjpgs.pl ImageMagick JPG rescaling | |
307 | ||
308 | =head1 NAME | |
309 | ||
310 | crunchjpgs.pl - Compress all JPG images in a PDF | |
311 | ||
312 | =head1 SYNOPSIS | |
313 | ||
314 | crunchjpgs.pl [options] infile.pdf [outfile.pdf] | |
315 | ||
316 | Options: | |
317 | -j --justjpgs make script skip non-JPGs | |
318 | -q --quality select JPG output quality (default 50) | |
319 | -s --scale=num select a rescaling factor for the JPGs (default 100%) | |
320 | -m --scalemin=size don't scale JPGs smaller than this pixel size (width or height) | |
321 | -O --only=imnum only change the specified images (can be used mutliple times) | |
322 | -S --skip=imnum don't change the specified images (can be used mutliple times) | |
323 | -o --order preserve the internal PDF ordering for output | |
324 | --veryverbose increases the verbosity | |
325 | -v --verbose print diagnostic messages | |
326 | -h --help verbose help message | |
327 | -V --version print CAM::PDF version | |
328 | ||
329 | The available values for --scale are: | |
330 | ||
331 | 1 100% | |
332 | 2 50% | |
333 | 4 25% | |
334 | 8 12.5% | |
335 | ||
336 | C<imnum> is a comma-separated list of integers indicating the images | |
337 | in order that they appear in the PDF. Use F<listimages.pl> to retrieve | |
338 | the image numbers. | |
339 | ||
340 | =head1 DESCRIPTION | |
341 | ||
342 | Requires the ImageMagick B<convert> program to be available | |
343 | ||
344 | Tweak all of the JPG images embedded in a PDF to reduce their size. | |
345 | This reduction can come from increasing the compression and/or | |
346 | rescaling the whole image. Various options give you full control over | |
347 | which images are altered. | |
348 | ||
349 | =head1 SEE ALSO | |
350 | ||
351 | CAM::PDF | |
352 | ||
353 | F<listimages.pl> | |
354 | ||
355 | F<extractallimages.pl> | |
356 | ||
357 | F<extractjpgs.pl> | |
358 | ||
359 | F<uninlinepdfimages.pl> | |
360 | ||
361 | =head1 AUTHOR | |
362 | ||
363 | See L<CAM::PDF> | |
364 | ||
365 | =cut |
0 | #!/usr/bin/perl -w | |
1 | ||
2 | package main; | |
3 | ||
4 | use warnings; | |
5 | use strict; | |
6 | use CAM::PDF; | |
7 | use Getopt::Long; | |
8 | use Pod::Usage; | |
9 | ||
10 | our $VERSION = '1.58'; | |
11 | ||
12 | my %opts = ( | |
13 | template => 'crunchjpg_tmpl.pdf', | |
14 | ||
15 | verbose => 0, | |
16 | help => 0, | |
17 | version => 0, | |
18 | skip => {}, | |
19 | only => {}, | |
20 | ||
21 | # Temporary values: | |
22 | onlyval => [], | |
23 | skipval => [], | |
24 | ); | |
25 | ||
26 | Getopt::Long::Configure('bundling'); | |
27 | GetOptions('S|skip=s' => \@{$opts{skipval}}, | |
28 | 'O|only=s' => \@{$opts{onlyval}}, | |
29 | 'v|verbose' => \$opts{verbose}, | |
30 | 'h|help' => \$opts{help}, | |
31 | 'V|version' => \$opts{version}, | |
32 | ) or pod2usage(1); | |
33 | if ($opts{help}) | |
34 | { | |
35 | pod2usage(-exitstatus => 0, -verbose => 2); | |
36 | } | |
37 | if ($opts{version}) | |
38 | { | |
39 | print "CAM::PDF v$CAM::PDF::VERSION\n"; | |
40 | exit 0; | |
41 | } | |
42 | ||
43 | foreach my $flag (qw( skip only )) | |
44 | { | |
45 | foreach my $val (@{$opts{$flag.'val'}}) | |
46 | { | |
47 | foreach my $key (split /\D+/xms, $val) | |
48 | { | |
49 | $opts{$flag}->{$key} = 1; | |
50 | } | |
51 | } | |
52 | } | |
53 | if (!-f $opts{template}) | |
54 | { | |
55 | die "Cannot find the template pdf called $opts{template}\n"; | |
56 | } | |
57 | ||
58 | if (@ARGV < 2) | |
59 | { | |
60 | pod2usage(1); | |
61 | } | |
62 | ||
63 | my $infile = shift; | |
64 | my $outdir = shift; | |
65 | ||
66 | my $doc = CAM::PDF->new($infile) || die "$CAM::PDF::errstr\n"; | |
67 | ||
68 | my $nimages = 0; | |
69 | my $rimages = 0; | |
70 | my %doneobjs; | |
71 | ||
72 | foreach my $objnum (keys %{$doc->{xref}}) | |
73 | { | |
74 | my $xobj = $doc->dereference($objnum); | |
75 | if ($xobj->{value}->{type} eq 'dictionary') | |
76 | { | |
77 | my $im = $xobj->{value}->{value}; | |
78 | if (exists $im->{Type} && $doc->getValue($im->{Type}) eq 'XObject' && | |
79 | exists $im->{Subtype} && $doc->getValue($im->{Subtype}) eq 'Image') | |
80 | { | |
81 | my $ref = '(no name)'; | |
82 | if ($im->{Name}) | |
83 | { | |
84 | $ref = $doc->getValue($im->{Name}); | |
85 | } | |
86 | my $w = $im->{Width} || $im->{W} || 0; | |
87 | if ($w) | |
88 | { | |
89 | $w = $doc->getValue($w); | |
90 | } | |
91 | my $h = $im->{Height} || $im->{H} || 0; | |
92 | if ($h) | |
93 | { | |
94 | $h = $doc->getValue($h); | |
95 | } | |
96 | ||
97 | next if (exists $doneobjs{$objnum}); | |
98 | ||
99 | $nimages++; | |
100 | _inform("Image $nimages, $ref = object $objnum, (w,h)=($w,$h)", $opts{verbose}); | |
101 | ||
102 | if (exists $opts{skip}->{$objnum} || | |
103 | (0 < scalar keys %{$opts{only}} && !exists $opts{only}->{$objnum})) | |
104 | { | |
105 | _inform("Skipping object $objnum", $opts{verbose}); | |
106 | next; | |
107 | } | |
108 | ||
109 | my $isjpg = _isjpg($im); | |
110 | ||
111 | my $oldsize = $doc->getValue($im->{Length}); | |
112 | if (!$oldsize) | |
113 | { | |
114 | die "PDF error: Failed to get size of image\n"; | |
115 | } | |
116 | ||
117 | my $tmpl = CAM::PDF->new($opts{template}) || die "$CAM::PDF::errstr\n"; | |
118 | ||
119 | # Get a handle on the needed data bits from the template | |
120 | my $media_array = $tmpl->getValue($tmpl->getPage(1)->{MediaBox}); | |
121 | my $rawpage = $tmpl->getPageContent(1); | |
122 | ||
123 | $media_array->[2]->{value} = $w; | |
124 | $media_array->[3]->{value} = $h; | |
125 | my $page = $rawpage; | |
126 | $page =~ s/xxx/$w/igxms; | |
127 | $page =~ s/yyy/$h/igxms; | |
128 | $tmpl->setPageContent(1, $page); | |
129 | $tmpl->replaceObject(9, $doc, $objnum, 1); | |
130 | ||
131 | my $ofile = "/tmp/crunchjpg.$$"; | |
132 | $tmpl->cleanoutput($ofile); | |
133 | ||
134 | if (!-d $outdir) | |
135 | { | |
136 | require File::Path; | |
137 | File::Path::mkpath($outdir); | |
138 | } | |
139 | if ($isjpg) | |
140 | { | |
141 | my $result = `convert -quality 50 -density 72x72 -page ${w}x$h pdf:$ofile jpg:$outdir/$objnum.jpg`; ## no critic (Backtick) | |
142 | _inform($result, $opts{verbose}); | |
143 | } | |
144 | else | |
145 | { | |
146 | my $result = `convert -density 72x72 -page ${w}x$h pdf:$ofile gif:$outdir/$objnum.gif`; ## no critic (Backtick) | |
147 | _inform($result, $opts{verbose}); | |
148 | } | |
149 | ||
150 | $doneobjs{$objnum} = 1; | |
151 | $rimages++; | |
152 | } | |
153 | } | |
154 | } | |
155 | ||
156 | _inform("Extracted $rimages of $nimages images", $opts{verbose}); | |
157 | ||
158 | ||
159 | sub _isjpg | |
160 | { | |
161 | my $im = shift; | |
162 | return if (!$im->{Filter}); | |
163 | ||
164 | my $f = $im->{Filter}; | |
165 | my @names = $f->{type} eq 'array' ? @{$f->{value}} : $f; | |
166 | for my $e (@names) | |
167 | { | |
168 | my $name = $doc->getValue($e); | |
169 | if (ref $name) | |
170 | { | |
171 | $name = $name->{value}; | |
172 | } | |
173 | #warn "Checking $name\n"; | |
174 | if ($name eq 'DCTDecode') | |
175 | { | |
176 | return 1; | |
177 | } | |
178 | } | |
179 | return; | |
180 | } | |
181 | ||
182 | sub _inform | |
183 | { | |
184 | my $str = shift; | |
185 | my $verbose = shift; | |
186 | ||
187 | if ($verbose) | |
188 | { | |
189 | print STDERR $str, "\n"; | |
190 | } | |
191 | return; | |
192 | } | |
193 | ||
194 | __END__ | |
195 | ||
196 | ||
197 | =for stopwords extractallimages.pl ImageMagick | |
198 | ||
199 | =head1 NAME | |
200 | ||
201 | extractallimages.pl - Save copies of all PDF images to a directory | |
202 | ||
203 | =head1 SYNOPSIS | |
204 | ||
205 | extractallimages.pl [options] infile.pdf outdirectory | |
206 | ||
207 | Options: | |
208 | -O --only=imnum only output the specified images (can be used mutliple times) | |
209 | -S --skip=imnum don't output the specified images (can be used mutliple times) | |
210 | -v --verbose print diagnostic messages | |
211 | -h --help verbose help message | |
212 | -V --version print CAM::PDF version | |
213 | ||
214 | C<imnum> is a comma-separated list of integers indicating the images | |
215 | in order that they appear in the PDF. Use F<listimages.pl> to retrieve | |
216 | the image numbers. | |
217 | ||
218 | =head1 DESCRIPTION | |
219 | ||
220 | Requires the ImageMagick B<convert> program to be available | |
221 | ||
222 | Searches the PDF for images and saves them as individual files in the | |
223 | specified directory. The files are named C<E<lt>imnumE<gt>.jpg> or C<E<lt>imnumE<gt>.gif>. | |
224 | ||
225 | =head1 SEE ALSO | |
226 | ||
227 | CAM::PDF | |
228 | ||
229 | F<crunchjpgs.pl> | |
230 | ||
231 | F<listimages.pl> | |
232 | ||
233 | F<extractjpgs.pl> | |
234 | ||
235 | F<uninlinepdfimages.pl> | |
236 | ||
237 | =head1 AUTHOR | |
238 | ||
239 | See L<CAM::PDF> | |
240 | ||
241 | =cut |
0 | #!/usr/bin/perl -w | |
1 | ||
2 | package main; | |
3 | ||
4 | use warnings; | |
5 | use strict; | |
6 | use CAM::PDF; | |
7 | use Getopt::Long; | |
8 | use Pod::Usage; | |
9 | ||
10 | our $VERSION = '1.58'; | |
11 | ||
12 | my %opts = ( | |
13 | template => 'crunchjpg_tmpl.pdf', | |
14 | ||
15 | verbose => 0, | |
16 | help => 0, | |
17 | version => 0, | |
18 | skip => {}, | |
19 | ||
20 | # Temporary values: | |
21 | skipval => [], | |
22 | ); | |
23 | ||
24 | Getopt::Long::Configure('bundling'); | |
25 | GetOptions('S|skip=s' => \@{$opts{skipval}}, | |
26 | 'v|verbose' => \$opts{verbose}, | |
27 | 'h|help' => \$opts{help}, | |
28 | 'V|version' => \$opts{version}, | |
29 | ) or pod2usage(1); | |
30 | if ($opts{help}) | |
31 | { | |
32 | pod2usage(-exitstatus => 0, -verbose => 2); | |
33 | } | |
34 | if ($opts{version}) | |
35 | { | |
36 | print "CAM::PDF v$CAM::PDF::VERSION\n"; | |
37 | exit 0; | |
38 | } | |
39 | ||
40 | foreach my $flag (qw( skip )) | |
41 | { | |
42 | foreach my $val (@{$opts{$flag.'val'}}) | |
43 | { | |
44 | foreach my $key (split /\D+/xms, $val) | |
45 | { | |
46 | $opts{$flag}->{$key} = 1; | |
47 | } | |
48 | } | |
49 | } | |
50 | if (!-f $opts{template}) | |
51 | { | |
52 | die "Cannot find the template pdf called $opts{template}\n"; | |
53 | } | |
54 | ||
55 | if (@ARGV < 2) | |
56 | { | |
57 | pod2usage(1); | |
58 | } | |
59 | ||
60 | my $infile = shift; | |
61 | my $outdir = shift; | |
62 | ||
63 | my $doc = CAM::PDF->new($infile) || die "$CAM::PDF::errstr\n"; | |
64 | ||
65 | my $pages = $doc->numPages(); | |
66 | my $nimages = 0; | |
67 | my $rimages = 0; | |
68 | my %doneobjs; | |
69 | ||
70 | for my $p (1..$pages) | |
71 | { | |
72 | my $c = $doc->getPageContent($p); | |
73 | my @parts = split /(\/[\w]+\s*Do)\b/xms, $c; | |
74 | foreach my $part (@parts) | |
75 | { | |
76 | if ($part =~ m/\A(\/[\w]+)\s*Do\z/xms) | |
77 | { | |
78 | my $ref = $1; | |
79 | my $xobj = $doc->dereference($ref, $p); | |
80 | my $objnum = $xobj->{objnum}; | |
81 | my $im = $doc->getValue($xobj); | |
82 | my $w = $im->{Width} || $im->{W} || 0; | |
83 | if ($w) | |
84 | { | |
85 | $w = $doc->getValue($w); | |
86 | } | |
87 | my $h = $im->{Height} || $im->{H} || 0; | |
88 | if ($h) | |
89 | { | |
90 | $h = $doc->getValue($h); | |
91 | } | |
92 | ||
93 | next if (exists $doneobjs{$objnum}); | |
94 | ||
95 | $nimages++; | |
96 | _inform("Image $nimages page $p, $ref = object $objnum, (w,h)=($w,$h)", $opts{verbose}); | |
97 | ||
98 | if (exists $opts{skip}->{$objnum}) | |
99 | { | |
100 | _inform("Skipping object $objnum", $opts{verbose}); | |
101 | next; | |
102 | } | |
103 | ||
104 | my $isjpg = _isjpg($im); | |
105 | ||
106 | if (!$isjpg) | |
107 | { | |
108 | _inform('Not a jpeg', $opts{verbose}); | |
109 | } | |
110 | else | |
111 | { | |
112 | my $oldsize = $doc->getValue($im->{Length}); | |
113 | if (!$oldsize) | |
114 | { | |
115 | die "PDF error: Failed to get size of image\n"; | |
116 | } | |
117 | ||
118 | my $tmpl = CAM::PDF->new($opts{template}) || die "$CAM::PDF::errstr\n"; | |
119 | ||
120 | # Get a handle on the needed data bits from the template | |
121 | my $media_array = $tmpl->getValue($tmpl->getPage(1)->{MediaBox}); | |
122 | my $rawpage = $tmpl->getPageContent(1); | |
123 | ||
124 | $media_array->[2]->{value} = $w; | |
125 | $media_array->[3]->{value} = $h; | |
126 | my $page = $rawpage; | |
127 | $page =~ s/xxx/$w/igxms; | |
128 | $page =~ s/yyy/$h/igxms; | |
129 | $tmpl->setPageContent(1, $page); | |
130 | $tmpl->replaceObject(9, $doc, $objnum, 1); | |
131 | ||
132 | my $ofile = "/tmp/crunchjpg.$$"; | |
133 | $tmpl->cleanoutput($ofile); | |
134 | ||
135 | if (!-d $outdir) | |
136 | { | |
137 | require File::Path; | |
138 | File::Path::mkpath($outdir); | |
139 | } | |
140 | `convert -quality 50 -density 72x72 -page ${w}x$h pdf:$ofile jpg:$outdir/$objnum.jpg`; ## no critic (Backtick) | |
141 | ||
142 | $doneobjs{$objnum} = 1; | |
143 | $rimages++; | |
144 | } | |
145 | } | |
146 | } | |
147 | } | |
148 | ||
149 | _inform("Extracted $rimages of $nimages images", $opts{verbose}); | |
150 | ||
151 | sub _isjpg | |
152 | { | |
153 | my $im = shift; | |
154 | return if (!$im->{Filter}); | |
155 | ||
156 | my $f = $im->{Filter}; | |
157 | my @names = $f->{type} eq 'array' ? @{$f->{value}} : $f; | |
158 | for my $e (@names) | |
159 | { | |
160 | my $name = $doc->getValue($e); | |
161 | if (ref $name) | |
162 | { | |
163 | $name = $name->{value}; | |
164 | } | |
165 | #warn "Checking $name\n"; | |
166 | if ($name eq 'DCTDecode') | |
167 | { | |
168 | return 1; | |
169 | } | |
170 | } | |
171 | return; | |
172 | } | |
173 | ||
174 | sub _inform | |
175 | { | |
176 | my $str = shift; | |
177 | my $verbose = shift; | |
178 | ||
179 | if ($verbose) | |
180 | { | |
181 | print STDERR $str, "\n"; | |
182 | } | |
183 | return; | |
184 | } | |
185 | ||
186 | ||
187 | __END__ | |
188 | ||
189 | =for stopwords extractjpgs.pl ImageMagick JPG | |
190 | ||
191 | =head1 NAME | |
192 | ||
193 | extractjpgs.pl - Save copies of all PDF JPG images to a directory | |
194 | ||
195 | =head1 SYNOPSIS | |
196 | ||
197 | extractjpgs.pl [options] infile.pdf outdirectory | |
198 | ||
199 | Options: | |
200 | -S --skip=imnum don't output the specified images (can be used mutliple times) | |
201 | -v --verbose print diagnostic messages | |
202 | -h --help verbose help message | |
203 | -V --version print CAM::PDF version | |
204 | ||
205 | C<imnum> is a comma-separated list of integers indicating the images | |
206 | in order that they appear in the PDF. Use F<listimages.pl> to retrieve | |
207 | the image numbers. | |
208 | ||
209 | =head1 DESCRIPTION | |
210 | ||
211 | Requires the ImageMagick B<convert> program to be available | |
212 | ||
213 | Searches the PDF for JPG images and saves them as individual files in the | |
214 | specified directory. The files are named C<E<lt>imnumE<gt>.jpg>. | |
215 | ||
216 | =head1 SEE ALSO | |
217 | ||
218 | CAM::PDF | |
219 | ||
220 | F<crunchjpgs.pl> | |
221 | ||
222 | F<listimages.pl> | |
223 | ||
224 | F<extractallimages.pl> | |
225 | ||
226 | F<uninlinepdfimages.pl> | |
227 | ||
228 | =head1 AUTHOR | |
229 | ||
230 | See L<CAM::PDF> | |
231 | ||
232 | =cut |