266 lines
12 KiB
Diff
266 lines
12 KiB
Diff
From 0bafbd9c1273fab0dc79fd20db0ffc4443683f96 Mon Sep 17 00:00:00 2001
|
|
From: Ken Sharp <ken.sharp@artifex.com>
|
|
Date: Mon, 29 Apr 2019 11:14:06 +0100
|
|
Subject: [PATCH] PDF interpreter - Decode ToUnicode entries of the form
|
|
/Identity-H/V
|
|
|
|
Bug #701003 "Text searchability broken due to omission of /ToUnicode /Identity-H"
|
|
|
|
The PDF references from 1.2 to 2.0 all state that the value associated
|
|
with a ToUnicode key in a FontDescriptor must be a stream object. However
|
|
this file (and one case seen previously, bug 687351) has FontDescriptor
|
|
dictionaries where the value associated with a /ToUnicode key is a
|
|
name object, in both cases /Identity-H.
|
|
|
|
Although this is clearly not legal, Acrobat not only tolerates it, it
|
|
actually uses it for search/copy/paste (see bug 701003 for details).
|
|
Without the key Acrobat is unable to successfully search the output file.
|
|
|
|
We can't simply preserve the name object as a ToUnicode value; when
|
|
handling ToUnicode we actually decode the CMap and build a
|
|
GlyphNames2Unicode map (an internal representation of the G2U data
|
|
produced by the Microsoft PostScript printer driver). When writing the
|
|
output file we use that information to get a Unicode value for each
|
|
character we write, and build a new ToUnicode CMap using that.
|
|
|
|
This commit tackles the problem by pre-scanning for a name object and
|
|
then checking to see if it's Identity-H or Identity-V (although we have
|
|
not seen an Identity-V, there seems no reason why it wouldn't be
|
|
equally valid). If we find either of these then we construct a
|
|
GlyphNames2Unicode table for all possible values (0 - 65535) and store
|
|
that with the font as normal. When we write the output file we only
|
|
write the required entries for the subset font, so we write a now
|
|
completely legal ToUnicode CMap, and Acrobat is equally happy with that
|
|
as the original name.
|
|
|
|
If the ToUnicode value isn't a name object, or isn't one of the
|
|
identities then we proceed as before. This means we will print a
|
|
warning for non-conforming ToUnicode entries and ignore them.
|
|
---
|
|
Resource/Init/pdf_font.ps | 200 ++++++++++++++++++++++++++++++----------------
|
|
1 file changed, 129 insertions(+), 71 deletions(-)
|
|
|
|
diff --git a/Resource/Init/pdf_font.ps b/Resource/Init/pdf_font.ps
|
|
index 0e802d3..964d54c 100644
|
|
--- a/Resource/Init/pdf_font.ps
|
|
+++ b/Resource/Init/pdf_font.ps
|
|
@@ -621,86 +621,144 @@ currentdict end readonly def
|
|
PDFDEBUG {
|
|
(.processToUnicode beg) =
|
|
} if
|
|
- 2 index /ToUnicode knownoget {
|
|
- dup type /dicttype eq { dup /File known not } { //true } ifelse {
|
|
- % We undefine wrong /Length and define /File in stream dictionaries.
|
|
- % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
|
|
- ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning
|
|
- pop
|
|
+
|
|
+ 2 index /ToUnicode knownoget
|
|
+ {
|
|
+ dup type /nametype eq {
|
|
+ % This is contrary to the specification but it seems that Acrobat at least will accept
|
|
+ % a ToUnicode with a value of Identity-H *and* will use that for search, copy/paste.
|
|
+ % We can't pass through a name, so the best we can do is build a GlyphNames2Unicode
|
|
+ % map matching that which would have been generated by a full 16-bit Identity CMap
|
|
+ %
|
|
+ % See bug numbers 701003 and 687351
|
|
+ %
|
|
+ dup /Identity-H eq 1 index /Identity-V eq or{
|
|
+ pop
|
|
+ 1 index /FontInfo .knownget not {
|
|
+ currentglobal 2 index dup gcheck setglobal
|
|
+ /FontInfo 5 dict dup 5 1 roll .forceput
|
|
+ setglobal
|
|
+ } if
|
|
+ dup /GlyphNames2Unicode .knownget not {
|
|
+ //true % No existing G2U, make one
|
|
+ } {
|
|
+ dup wcheck {
|
|
+ //false % Existing, writeable G2U, don't make new one
|
|
+ } {
|
|
+ pop //true % Existing read only G2U, make new one
|
|
+ } ifelse
|
|
+ } ifelse
|
|
+ {
|
|
+ currentglobal exch dup gcheck setglobal
|
|
+ dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
|
|
+ 3 2 roll setglobal
|
|
+ } if % font-res font-dict encoding|null font-info g2u
|
|
+
|
|
+ 0 1 65535{
|
|
+ % g2u index
|
|
+ dup dup 256 mod exch 256 idiv % g2u index lo-byte hi-byte
|
|
+ 2 string dup 0 4 -1 roll % g2u index lo-byte () () 0 hi-byte
|
|
+ put % g2u index lo-byte (x)
|
|
+ dup 1 % g2u index lo-byte (x) (x) 1
|
|
+ 4 -1 roll put % g2u index (x) (x) 1 lo-byte -> dict index (xx)
|
|
+ 2 index % g2u index (xx) dict
|
|
+ 3 1 roll % g2u g2u index (xx)
|
|
+ put % g2u
|
|
+ } for
|
|
+ pop % font-res font-dict encoding|null font-info
|
|
+ pop % font-res font-dict encoding|null
|
|
+ //false % We built a GlyphNames2Unicode table, don't need to process further
|
|
+ }{
|
|
+ //true % name is not Identity-V or H, fail by falling through
|
|
+ }ifelse
|
|
} {
|
|
- /PDFScanRules .getuserparam dup //null eq {
|
|
- pop //PDFScanRules_null
|
|
- } {
|
|
- 1 dict dup /PDFScanRules 4 -1 roll put
|
|
- } ifelse
|
|
- //PDFScanRules_true setuserparams
|
|
- PDFfile fileposition
|
|
- 3 -1 roll
|
|
- count 1 sub
|
|
- countdictstack
|
|
- { //false resolvestream
|
|
- % Following Acrobat we ignore everything outside
|
|
- % begincodespacerange .. endcmap.
|
|
- dup 0 (begincodespacerange) /SubFileDecode filter flushfile
|
|
- /CIDInit /ProcSet findresource begin
|
|
- //ToUnicodeCMapReader begin
|
|
- 12 dict begin
|
|
- /CMapType 2 def
|
|
- mark exch % emulate 'begincodespacerange'
|
|
- 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
|
|
- endcmap
|
|
- userdict /.lastToUnicode currentdict put
|
|
- end end end
|
|
- }
|
|
+ //true
|
|
+ } ifelse % not a name, try as a dictionary (as specified)
|
|
|
|
- PDFSTOPONERROR {
|
|
- { exec } 0 get
|
|
- //false
|
|
- 5 -2 roll
|
|
- 5
|
|
+ % If the ToUnicode isn't a name, or the name isn't Identity-V or -H then follow the specification
|
|
+ % If it's not a dictionary type (or has no /File), warn and ignore it; otherwise decode it and build a GlyphNames2Unicode
|
|
+ %
|
|
+ {
|
|
+ dup type /dicttype eq { dup /File known not } { //true } ifelse {
|
|
+ % We undefine wrong /Length and define /File in stream dictionaries.
|
|
+ % Bug687351.pdf defines /ToUnicode /Identity-H, what is incorrect.
|
|
+ ( **** Warning: Ignoring bad ToUnicode CMap.\n) pdfformatwarning
|
|
+ pop
|
|
} {
|
|
- { stopped } 0 get
|
|
- 4 2 roll
|
|
- 4
|
|
- } ifelse
|
|
- array astore cvx exec
|
|
+ /PDFScanRules .getuserparam dup //null eq {
|
|
+ pop //PDFScanRules_null
|
|
+ } {
|
|
+ 1 dict dup /PDFScanRules 4 -1 roll put
|
|
+ } ifelse
|
|
+ //PDFScanRules_true setuserparams
|
|
+ PDFfile fileposition
|
|
+ 3 -1 roll
|
|
+ count 1 sub
|
|
+ countdictstack
|
|
+ { //false resolvestream
|
|
+ % Following Acrobat we ignore everything outside
|
|
+ % begincodespacerange .. endcmap.
|
|
+ dup 0 (begincodespacerange) /SubFileDecode filter flushfile
|
|
+ /CIDInit /ProcSet findresource begin
|
|
+ //ToUnicodeCMapReader begin
|
|
+ 12 dict begin
|
|
+ /CMapType 2 def
|
|
+ mark exch % emulate 'begincodespacerange'
|
|
+ 0 (endcmap) /SubFileDecode filter cvx /begincmap cvx exch 2 .execn
|
|
+ endcmap
|
|
+ userdict /.lastToUnicode currentdict put
|
|
+ end end end
|
|
+ }
|
|
|
|
- countdictstack exch sub 0 .max { end } repeat
|
|
- count exch sub 2 sub 0 .max { exch pop } repeat
|
|
- 3 1 roll % Stach the stop flag.
|
|
- PDFfile exch setfileposition
|
|
- setuserparams
|
|
- {
|
|
- ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning
|
|
- } {
|
|
- 1 index /FontInfo .knownget not {
|
|
- currentglobal 2 index dup gcheck setglobal
|
|
- /FontInfo 5 dict dup 5 1 roll .forceput
|
|
- setglobal
|
|
- } if
|
|
- dup /GlyphNames2Unicode .knownget not {
|
|
- //true % No existing G2U, make one
|
|
+ PDFSTOPONERROR {
|
|
+ { exec } 0 get
|
|
+ //false
|
|
+ 5 -2 roll
|
|
+ 5
|
|
+ } {
|
|
+ { stopped } 0 get
|
|
+ 4 2 roll
|
|
+ 4
|
|
+ } ifelse
|
|
+ array astore cvx exec
|
|
+
|
|
+ countdictstack exch sub 0 .max { end } repeat
|
|
+ count exch sub 2 sub 0 .max { exch pop } repeat
|
|
+ 3 1 roll % Stach the stop flag.
|
|
+ PDFfile exch setfileposition
|
|
+ setuserparams
|
|
+ {
|
|
+ ( **** Warning: Failed to read ToUnicode CMap.\n) pdfformatwarning
|
|
} {
|
|
- dup wcheck {
|
|
- //false % Existing, writeable G2U, don't make new one
|
|
+ 1 index /FontInfo .knownget not {
|
|
+ currentglobal 2 index dup gcheck setglobal
|
|
+ /FontInfo 5 dict dup 5 1 roll .forceput
|
|
+ setglobal
|
|
+ } if
|
|
+ dup /GlyphNames2Unicode .knownget not {
|
|
+ //true % No existing G2U, make one
|
|
} {
|
|
- pop //true % Existing read only G2U, make new one
|
|
+ dup wcheck {
|
|
+ //false % Existing, writeable G2U, don't make new one
|
|
+ } {
|
|
+ pop //true % Existing read only G2U, make new one
|
|
+ } ifelse
|
|
} ifelse
|
|
+ {
|
|
+ currentglobal exch dup gcheck setglobal
|
|
+ dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
|
|
+ 3 2 roll setglobal
|
|
+ } if % font-res font-dict encoding|null font-info g2u
|
|
+ exch pop exch % font-res font-dict g2u encoding|null
|
|
+ userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap
|
|
+ .convert_ToUnicode-into-g2u % font-res font-dict
|
|
+ //null % font-res font-dict //null
|
|
} ifelse
|
|
- {
|
|
- currentglobal exch dup gcheck setglobal
|
|
- dup /GlyphNames2Unicode 100 dict dup 4 1 roll .forceput
|
|
- 3 2 roll setglobal
|
|
- } if % font-res font-dict encoding|null font-info g2u
|
|
- exch pop exch % font-res font-dict g2u encoding|null
|
|
- userdict /.lastToUnicode get % font-res font-dict g2u Encoding|null CMap
|
|
- .convert_ToUnicode-into-g2u % font-res font-dict
|
|
- //null % font-res font-dict //null
|
|
} ifelse
|
|
- } ifelse
|
|
- } if
|
|
- PDFDEBUG {
|
|
- (.processToUnicode end) =
|
|
+ } if
|
|
+ PDFDEBUG {
|
|
+ (.processToUnicode end) =
|
|
+ } if
|
|
} if
|
|
} if
|
|
} stopped
|
|
--
|
|
2.9.1
|
|
|