Révision 3541
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2AnnotationsMacro.groovy (revision 3541) | ||
---|---|---|
1 |
package org.txm.macro.pdf |
|
2 |
// STANDARD DECLARATIONS |
|
3 |
|
|
4 |
import org.kohsuke.args4j.* |
|
5 |
import groovy.transform.Field |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.apache.pdfbox.Loader |
|
8 |
import org.apache.pdfbox.text.PDFTextStripperByArea |
|
9 |
import org.apache.pdfbox.cos.COSName |
|
10 |
import java.awt.geom.Rectangle2D |
|
11 |
import java.awt.geom.Rectangle2D.Float |
|
12 |
import org.apache.pdfbox.cos.COSArray |
|
13 |
import org.apache.pdfbox.cos.COSFloat |
|
14 |
import org.apache.pdfbox.pdmodel.common.PDRectangle |
|
15 |
|
|
16 |
// BEGINNING OF PARAMETERS |
|
17 |
|
|
18 |
// Macro parameters. Either a single PDF file or a directory of PDF files can be given;
// when a usable directory is given it takes precedence over the single file.
@Field @Option(name="input_file", usage=".pdf input file", widget="File", required=false, def="")
def input_file

@Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="")
def input_dir

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return

// END OF PARAMETERS

if (input_dir != null && input_dir.isDirectory()) {
	// Directory mode: process every regular file whose name ends with ".pdf".
	// eachFileMatch tests the whole file name against the pattern, so the dot is
	// escaped and (?i) makes the extension case-insensitive ("A.PDF" is accepted too).
	def nFiles = 0
	input_dir.eachFileMatch(~/(?i).*\.pdf/) { f ->
		// eachFileMatch also yields directories; only regular files can be loaded as PDF.
		if (f.isFile()) {
			processFile(f)
			nFiles++
		}
	}
	println "Processed "+nFiles+" files."
} else if (input_file != null && input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
	// Single file mode.
	processFile(input_file)
} else {
	// Neither parameter is usable: say so instead of failing or silently doing nothing.
	println "No .pdf input file or input directory to process."
}
|
46 |
|
|
47 |
/**
 * Exports the annotations of one PDF file to a TSV file written next to it.
 *
 * The output file is named "<pdf name without extension>-annotations.tsv", is encoded
 * in UTF-8 and has four columns: page (1-based), type (annotation subtype),
 * highlight (text found under a Highlight annotation) and comment (annotation contents).
 * Highlight annotations are always exported; other annotations are exported only when
 * they have contents.
 *
 * The PDF is loaded before the output file is created, so an unreadable PDF does not
 * leave an empty TSV file behind. Errors are printed and do not propagate to the caller.
 *
 * @param input_file the PDF file to read
 */
def processFile(input_file) {

	def name = input_file.getName()
	def idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(input_file.getParentFile(), name + "-annotations.tsv")

	println "Processing "+name+"..."

	def doc = null
	try {
		doc = Loader.loadPDF(input_file)

		// withWriter closes the writer itself, whether or not the closure throws.
		outputFile.withWriter("UTF-8") { writer ->

			writer.println "page\ttype\thighlight\tcomment"

			for (int npage = 0; npage < doc.getNumberOfPages(); ++npage) {

				def page = doc.getPage(npage)

				page.getAnnotations().each { annotation ->
					def subtype = annotation.getSubtype()
					def contents = annotation.getContents()

					// Each row is built completely before being written, so a failure
					// while extracting text cannot leave a truncated row in the file.
					if ("Highlight".equals(subtype)) {
						def highlighted = extractHighlightedText(page, annotation)
						writer.println((npage+1)+"\t"+subtype+"\t"+quoteTsvField(highlighted)+"\t"+quoteTsvField(contents))
					} else if (contents != null) {
						writer.println((npage+1)+"\t"+subtype+"\t"+quoteTsvField("")+"\t"+quoteTsvField(contents))
					}
				}
			}
		} // writer
	} catch (Exception e) {
		e.printStackTrace()
	} finally {
		// Release the document even when reading or writing failed.
		if (doc != null) {
			try {
				doc.close()
			} catch (Exception e) {
				e.printStackTrace()
			}
		}
	}
}

/**
 * Wraps a value in double quotes for the TSV output, doubling embedded double quotes.
 *
 * @param value the field value, may be null
 * @return the quoted field; "" (two quote characters) when value is null or empty
 */
private String quoteTsvField(value) {
	if (value == null) return '""'
	return '"'+value.toString().replaceAll('"', '""')+'"'
}

/**
 * Returns the page text located under a Highlight annotation.
 *
 * The annotation's /QuadPoints array is read in groups of 8 numbers (one quadrilateral
 * per group). The text of each quadrilateral is trimmed and the pieces are joined with
 * a single space, in array order.
 * adapted from https://www.anycodings.com/1questions/5047580/java-apache-pdfbox-extract-highlighted-text
 *
 * @param page the page holding the annotation
 * @param annotation the Highlight annotation
 * @return the highlighted text, or "" when the annotation has no usable /QuadPoints
 */
private String extractHighlightedText(page, annotation) {

	def quads = annotation.getCOSObject().getDictionaryObject(COSName.getPDFName("QuadPoints"))
	if (!(quads instanceof COSArray)) return ""

	int nQuads = quads.size().intdiv(8)
	if (nQuads == 0) return ""

	def pageHeight = page.getMediaBox().getHeight()

	def stripper = new PDFTextStripperByArea()
	stripper.setSortByPosition(true)

	// One distinctly named region per quadrilateral, so the page content is parsed
	// only once for the whole annotation.
	for (int q = 0; q < nQuads; q++) {
		int k = q * 8
		// Point order used here: upper left (k, k+1), upper right (k+2, k+3), lower left (k+4, k+5).
		def ulx = quads.get(k).floatValue() - 1 // upper left x, widened by one unit to the left
		def uly = quads.get(k+1).floatValue() // upper left y
		def width = quads.get(k+2).floatValue() - quads.get(k+4).floatValue() // upperRightX - lowerLeftX
		def height = quads.get(k+3).floatValue() - quads.get(k+5).floatValue() // upperRightY - lowerLeftY

		// The y value is measured from the page bottom; the region rectangle
		// is given with y measured from the top of the media box.
		uly = pageHeight - uly

		stripper.addRegion("highlightedRegion"+q, new Rectangle2D.Double(ulx, uly, width, height))
	}
	stripper.extractRegions(page)

	def parts = []
	for (int q = 0; q < nQuads; q++) {
		parts << stripper.getTextForRegion("highlightedRegion"+q).trim()
	}
	return parts.join(" ")
}
|
140 |
|
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2TXTMacro.groovy (revision 3541) | ||
---|---|---|
15 | 15 |
@Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="") |
16 | 16 |
def input_dir |
17 | 17 |
|
18 |
@Field @Option(name="set_sort_by_position", usage="""The order of the text tokens in a PDF file may not be in the same as they appear visually on the screen. For example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and write out the normal text. |
|
19 |
The default is to not sort by position. |
|
18 | 20 |
|
21 |
A PDF writer could choose to write each character in a different order. By default PDFBox does not sort the text tokens before processing them due to performance reasons.""", widget="Boolean", required=true, def="false") |
|
22 |
def set_sort_by_position |
|
23 |
|
|
19 | 24 |
// Open the parameters input dialog box |
20 | 25 |
if (!ParametersDialog.open(this)) return |
21 | 26 |
|
... | ... | |
24 | 29 |
if (input_dir != null && input_dir.exists()) { |
25 | 30 |
nFiles = 0 |
26 | 31 |
input_dir.eachFileMatch(~/.*.pdf/) { f -> |
27 |
name = f.getName() |
|
28 |
println "Processing "+name+"..." |
|
29 |
idx = name.lastIndexOf(".") |
|
30 |
if (idx > 0) name = name.substring(0, idx) |
|
31 |
outputFile = new File(f.getParentFile(), name + ".txt") |
|
32 |
|
|
33 |
pdfFile = f.getAbsolutePath() |
|
34 |
if (pdfFile.toUpperCase().endsWith(".PDF")) { |
|
35 |
textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt" |
|
36 |
try { |
|
37 |
outputFile.withWriter("UTF-8") { writer -> |
|
38 |
doc = Loader.loadPDF(f) |
|
39 |
strip = new PDFTextStripper() |
|
40 |
strip.setSortByPosition(true) |
|
41 |
strip.setPageStart("<pb/>") |
|
42 |
// strip.setParagraphStart("\n<p>") |
|
43 |
// strip.setParagraphEnd("</p>") |
|
44 |
writer.print strip.getText(doc) |
|
45 |
doc.close() |
|
46 |
writer.close() |
|
47 |
nFiles++ |
|
48 |
} |
|
49 |
} catch (Exception e) { |
|
50 |
e.printStackTrace() |
|
51 |
} |
|
32 |
if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) { |
|
33 |
processFile(f) |
|
34 |
nFiles++ |
|
52 | 35 |
} |
53 | 36 |
} |
54 | 37 |
println "Processed "+nFiles+" files." |
55 | 38 |
} else { |
39 |
if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) { |
|
40 |
processFile(input_file) |
|
41 |
} |
|
42 |
} |
|
43 |
|
|
44 |
def processFile(input_file) { |
|
45 |
|
|
56 | 46 |
name = input_file.getName() |
57 | 47 |
idx = name.lastIndexOf(".") |
58 | 48 |
if (idx > 0) name = name.substring(0, idx) |
... | ... | |
60 | 50 |
|
61 | 51 |
pdfFile = input_file.getAbsolutePath() |
62 | 52 |
if (pdfFile.toUpperCase().endsWith(".PDF")) { |
53 |
|
|
54 |
println "Processing "+name+"..." |
|
55 |
|
|
63 | 56 |
textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt" |
64 | 57 |
try { |
65 | 58 |
outputFile.withWriter("UTF-8") { writer -> |
66 | 59 |
doc = Loader.loadPDF(input_file) |
67 | 60 |
strip = new PDFTextStripper() |
61 |
|
|
62 |
/* |
|
68 | 63 |
println "SpacingTolerance = "+strip.getSpacingTolerance() |
69 | 64 |
// Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note that the default value for this has been determined from trial and error. Setting this value larger will reduce the number of spaces added. |
70 | 65 |
println "AverageCharTolerance = "+strip.getAverageCharTolerance() |
... | ... | |
73 | 68 |
// sets the multiple of whitespace character widths for the current text which the current line start can be indented from the previous line start beyond which the current line start is considered to be a paragraph start. The default value is 2.0. |
74 | 69 |
println "DropThreshold = "+strip.getDropThreshold() |
75 | 70 |
// sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line start is considered to be a paragraph start. The default value is 2.5. |
76 |
strip.setSortByPosition(true) |
|
71 |
*/ |
|
72 |
|
|
73 |
/* |
|
74 |
println "SeparateByBeads = "+strip.getSeparateByBeads() |
|
75 |
// this will tell if the text stripper should separate by beads |
|
76 |
println "ArticleStart = "+strip.getArticleStart() |
|
77 |
// the string which will be used at the beginning of an article |
|
78 |
println "ArticleEnd = "+strip.getArticleEnd() |
|
79 |
// the string which will be used at the end of an article |
|
80 |
println "AddMoreFormatting = "+strip.getAddMoreFormatting() |
|
81 |
// this will tell if the text stripper should add some more text formatting |
|
82 |
println "LineSeparator = "+strip.getLineSeparator() |
|
83 |
// the desired line separator for output text. The line.separator system property is used if the line separator preference is not set explicitly |
|
84 |
println "WordSeparator = "+strip.getWordSeparator() |
|
85 |
// set the desired word separator for output text. The PDFBox text extraction algorithm will output a space character if there is enough space between two words. By default a space character is used. If you need and accurate count of characters that are found in a PDF document then you might want to set the word separator to the empty string |
|
86 |
*/ |
|
87 |
|
|
88 |
strip.setSortByPosition(set_sort_by_position) |
|
77 | 89 |
strip.setPageStart("<pb/>") |
78 | 90 |
// strip.setParagraphStart("\n<p>") |
79 | 91 |
// strip.setParagraphEnd("</p>") |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2PNGMacro.groovy (revision 3541) | ||
---|---|---|
26 | 26 |
if (input_dir != null && input_dir.exists()) { |
27 | 27 |
nFiles = 0 |
28 | 28 |
input_dir.eachFileMatch(~/.*.pdf/) { f -> |
29 |
name = f.getName() |
|
30 |
idx = name.lastIndexOf(".") |
|
31 |
if (idx > 0) name = name.substring(0, idx) |
|
32 |
dir = f.getParentFile() |
|
33 |
println "Processing "+name+"..." |
|
34 |
|
|
35 |
pdfFile = f.getAbsolutePath() |
|
36 |
if (pdfFile.toUpperCase().endsWith(".PDF")) { |
|
37 |
textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt" |
|
38 |
try { |
|
39 |
doc = Loader.loadPDF(f) |
|
40 |
pdfRenderer = new PDFRenderer(doc) |
|
41 |
for (page = 0; page < doc.getNumberOfPages(); ++page) { |
|
42 |
bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB) |
|
43 |
fos = new FileOutputStream(new File(dir, name + "-" + (page+1) + ".png")) |
|
44 |
ImageIOUtil.writeImage(bim, "png", fos, 300) |
|
45 |
} |
|
46 |
fos.close() |
|
47 |
doc.close() |
|
48 |
nFiles++ |
|
49 |
} catch (Exception e) { |
|
50 |
e.printStackTrace() |
|
51 |
} |
|
29 |
if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) { |
|
30 |
processFile(f) |
|
31 |
nFiles++ |
|
52 | 32 |
} |
53 | 33 |
} |
54 | 34 |
println "Processed "+nFiles+" files." |
55 | 35 |
} else { |
36 |
if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) { |
|
37 |
processFile(input_file) |
|
38 |
} |
|
39 |
} |
|
40 |
|
|
41 |
def processFile(input_file) { |
|
42 |
|
|
56 | 43 |
name = input_file.getName() |
57 | 44 |
dir = input_file.getParentFile() |
58 | 45 |
idx = name.lastIndexOf(".") |
... | ... | |
60 | 47 |
|
61 | 48 |
pdfFile = input_file.getAbsolutePath() |
62 | 49 |
if (pdfFile.toUpperCase().endsWith(".PDF")) { |
50 |
|
|
51 |
println "Processing "+name+"..." |
|
52 |
|
|
63 | 53 |
textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt" |
64 | 54 |
try { |
65 | 55 |
doc = Loader.loadPDF(input_file) |
Formats disponibles : Unified diff