Révision 3541

TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2AnnotationsMacro.groovy (revision 3541)
1
package org.txm.macro.pdf
2
// STANDARD DECLARATIONS
3

  
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.apache.pdfbox.Loader
8
import org.apache.pdfbox.text.PDFTextStripperByArea
9
import org.apache.pdfbox.cos.COSName
10
import java.awt.geom.Rectangle2D
11
import java.awt.geom.Rectangle2D.Float
12
import org.apache.pdfbox.cos.COSArray
13
import org.apache.pdfbox.cos.COSFloat
14
import org.apache.pdfbox.pdmodel.common.PDRectangle
15

  
16
// BEGINNING OF PARAMETERS
17

  
18
@Field @Option(name="input_file", usage=".pdf input file", widget="File", required=false, def="")
19
def input_file
20

  
21
@Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="")
22
def input_dir
23

  
24

  
25
// Open the parameters input dialog box
26
if (!ParametersDialog.open(this)) return
27

  
28
// END OF PARAMETERS
29

  
30
ArrayList<String> highlightedTexts = new ArrayList<>()
31

  
32
// Entry point of the macro.
// Batch mode: when input_dir is an existing directory, every PDF file directly
// inside it is processed and the number of processed files is reported.
// Single-file mode: otherwise input_file is processed, provided it is set and
// its path ends with ".pdf" (case-insensitive).
if (input_dir != null && input_dir.isDirectory()) {
	def nFiles = 0
	// (?i) makes the match case-insensitive so "X.PDF" is picked up too, and the
	// dot is escaped so that only a real ".pdf" extension matches.
	input_dir.eachFileMatch(~/(?i).*\.pdf/) { f ->
		// eachFileMatch also yields sub-directories; only regular files are PDFs.
		if (f.isFile()) {
			processFile(f)
			nFiles++
		}
	}
	println "Processed "+nFiles+" files."
} else if (input_file != null && input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
	processFile(input_file)
}
46

  
47
/**
 * Exports the annotations of one PDF file to a TSV file.
 *
 * The output file is created in the same directory as the input file and is
 * named "<input name without extension>-annotations.tsv". It is written in
 * UTF-8 with the header "page, type, highlight, comment" (tab separated).
 * Each "Highlight" annotation produces one row holding the 1-based page
 * number, the annotation subtype, the text found under the highlight and the
 * annotation comment. Any other annotation produces a row only when it has a
 * comment; its "highlight" column is then empty.
 *
 * Exceptions are caught and their stack trace printed, so that a failure on
 * one file does not abort the processing of the remaining files.
 *
 * @param input_file the PDF file (java.io.File) to read
 */
def processFile(input_file) {

	def name = input_file.getName()
	def idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(input_file.getParentFile(), name + "-annotations.tsv")

	println "Processing "+name+"..."

	try {
		// withWriter closes the writer itself, even when the closure throws.
		outputFile.withWriter("UTF-8") { writer ->

			def doc = Loader.loadPDF(input_file)
			try {
				writer.println "page\ttype\thighlight\tcomment"

				for (int npage = 0; npage < doc.getNumberOfPages(); ++npage) {

					def page = doc.getPage(npage)
					def pageNumber = npage + 1 // pages are reported 1-based
					def stripper = new PDFTextStripperByArea()
					stripper.setSortByPosition(true)

					page.getAnnotations().each { annotation ->
						def subtype = annotation.getSubtype()
						def comment = annotation.getContents()

						if ("Highlight".equals(subtype)) {
							// Build the whole row before writing it, so a failure
							// cannot leave a half-written line in the TSV file.
							def highlighted = extractHighlightedText(stripper, page, annotation)
							writer.println pageNumber+"\t"+subtype+"\t"+quoteTsvField(highlighted)+"\t"+quoteTsvField(comment)
						} else if (comment != null) {
							writer.println pageNumber+"\t"+subtype+"\t"+quoteTsvField("")+"\t"+quoteTsvField(comment)
						}
					}
				}
			} finally {
				// Release the document even when extraction fails.
				doc.close()
			}
		} // writer
	} catch (Exception e) {
		e.printStackTrace()
	}
}

/**
 * Returns the page text covered by a highlight annotation.
 *
 * The annotation's "QuadPoints" entry is read in groups of eight numbers, one
 * group per highlighted rectangle. The text of each rectangle is extracted and
 * trimmed, and the pieces are joined with a single space.
 * Adapted from https://www.anycodings.com/1questions/5047580/java-apache-pdfbox-extract-highlighted-text
 *
 * @param stripper the PDFTextStripperByArea used to extract the text
 * @param page the page holding the annotation
 * @param annotation the highlight annotation
 * @return the highlighted text, or "" when the annotation has no usable QuadPoints
 */
def extractHighlightedText(stripper, page, annotation) {

	def quads = annotation.getCOSObject().getDictionaryObject(COSName.getPDFName("QuadPoints"))
	// A missing or malformed QuadPoints entry yields no text instead of a
	// NullPointerException that would abort the whole file.
	if (!(quads instanceof COSArray)) return ""

	def pageHeight = page.getMediaBox().getHeight()
	def pieces = []
	int quadCount = quads.size().intdiv(8) // incomplete trailing groups are ignored

	for (int q = 0; q < quadCount; q++) {
		int k = q * 8

		// The eight values are read as the upper-left, upper-right, lower-left
		// and lower-right corners; only three of the coordinates are needed.
		double upperLeftX = quads.get(k).floatValue()
		double upperLeftY = quads.get(k + 1).floatValue()
		double upperRightX = quads.get(k + 2).floatValue()
		double upperRightY = quads.get(k + 3).floatValue()
		double lowerLeftX = quads.get(k + 4).floatValue()
		double lowerLeftY = quads.get(k + 5).floatValue()

		// NOTE(review): the 1-unit shift to the left is kept from the original
		// code; presumably it avoids clipping the first glyph - confirm.
		double x = upperLeftX - 1
		// Flip the y coordinate: it is measured here from the top of the media box.
		double y = pageHeight - upperLeftY
		double width = upperRightX - lowerLeftX
		double height = upperRightY - lowerLeftY

		stripper.addRegion("highlightedRegion", new Rectangle2D.Double(x, y, width, height))
		stripper.extractRegions(page)
		pieces << stripper.getTextForRegion("highlightedRegion").trim()
	}

	return pieces.join(" ")
}

/**
 * Wraps a value in double quotes for the TSV output, doubling any embedded
 * double quote. A null value is written as an empty quoted field.
 *
 * @param text the field value, may be null
 * @return the quoted field
 */
def quoteTsvField(text) {
	return '"' + (text == null ? "" : text.replaceAll('"', '""')) + '"'
}
140

  
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2TXTMacro.groovy (revision 3541)
15 15
@Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="")
16 16
def input_dir
17 17

  
18
@Field @Option(name="set_sort_by_position", usage="""The order of the text tokens in a PDF file may not be in the same as they appear visually on the screen. For example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and write out the normal text.
19
The default is to not sort by position.
18 20

  
21
A PDF writer could choose to write each character in a different order. By default PDFBox does not sort the text tokens before processing them due to performance reasons.""", widget="Boolean", required=true, def="false")
22
def set_sort_by_position
23

  
19 24
// Open the parameters input dialog box
20 25
if (!ParametersDialog.open(this)) return
21 26

  
......
24 29
if (input_dir != null &&  input_dir.exists()) {
25 30
	nFiles = 0
26 31
	input_dir.eachFileMatch(~/.*.pdf/) { f ->
27
		name = f.getName()
28
		println "Processing "+name+"..."
29
		idx = name.lastIndexOf(".")
30
		if (idx > 0) name = name.substring(0, idx)
31
		outputFile = new File(f.getParentFile(), name + ".txt")
32

  
33
		pdfFile = f.getAbsolutePath()
34
		if (pdfFile.toUpperCase().endsWith(".PDF")) {
35
			textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
36
			try {
37
				outputFile.withWriter("UTF-8") { writer ->
38
				doc = Loader.loadPDF(f)
39
				strip = new PDFTextStripper()
40
				strip.setSortByPosition(true)
41
				strip.setPageStart("<pb/>")
42
				// strip.setParagraphStart("\n<p>")
43
				// strip.setParagraphEnd("</p>")
44
				writer.print strip.getText(doc)
45
				doc.close()
46
				writer.close()
47
				nFiles++
48
				}
49
			} catch (Exception e) {
50
			e.printStackTrace()
51
			}
32
		if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
33
	 		processFile(f)
34
	 		nFiles++
52 35
		}
53 36
	}
54 37
	println "Processed "+nFiles+" files."
55 38
} else {
39
	if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
40
	 processFile(input_file)
41
	}
42
}
43

  
44
def processFile(input_file) {
45

  
56 46
	name = input_file.getName()
57 47
	idx = name.lastIndexOf(".")
58 48
	if (idx > 0) name = name.substring(0, idx)
......
60 50

  
61 51
	pdfFile = input_file.getAbsolutePath()
62 52
	if (pdfFile.toUpperCase().endsWith(".PDF")) {
53
	
54
		println "Processing "+name+"..."
55
		
63 56
		textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
64 57
		try {
65 58
			outputFile.withWriter("UTF-8") { writer ->
66 59
			doc = Loader.loadPDF(input_file)
67 60
			strip = new PDFTextStripper()
61
			
62
/*
68 63
			println "SpacingTolerance = "+strip.getSpacingTolerance()
69 64
			// Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note that the default value for this has been determined from trial and error. Setting this value larger will reduce the number of spaces added.
70 65
			println "AverageCharTolerance = "+strip.getAverageCharTolerance()
......
73 68
			// sets the multiple of whitespace character widths for the current text which the current line start can be indented from the previous line start beyond which the current line start is considered to be a paragraph start. The default value is 2.0.
74 69
			println "DropThreshold = "+strip.getDropThreshold()
75 70
			// sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line start is considered to be a paragraph start. The default value is 2.5.
76
			strip.setSortByPosition(true)
71
*/
72

  
73
/*
74
			println "SeparateByBeads = "+strip.getSeparateByBeads()
75
			// this will tell if the text stripper should separate by beads
76
			println "ArticleStart = "+strip.getArticleStart()
77
			// the string which will be used at the beginning of an article
78
			println "ArticleEnd = "+strip.getArticleEnd()
79
			// the string which will be used at the end of an article
80
			println "AddMoreFormatting = "+strip.getAddMoreFormatting()
81
			// this will tell if the text stripper should add some more text formatting
82
			println "LineSeparator = "+strip.getLineSeparator()
83
			// the desired line separator for output text. The line.separator system property is used if the line separator preference is not set explicitly
84
			println "WordSeparator = "+strip.getWordSeparator()
85
			// set the desired word separator for output text. The PDFBox text extraction algorithm will output a space character if there is enough space between two words. By default a space character is used. If you need and accurate count of characters that are found in a PDF document then you might want to set the word separator to the empty string
86
*/
87

  
88
			strip.setSortByPosition(set_sort_by_position)
77 89
			strip.setPageStart("<pb/>")
78 90
			// strip.setParagraphStart("\n<p>")
79 91
			// strip.setParagraphEnd("</p>")
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/pdf/PDF2PNGMacro.groovy (revision 3541)
26 26
if (input_dir != null &&  input_dir.exists()) {
27 27
	nFiles = 0
28 28
	input_dir.eachFileMatch(~/.*.pdf/) { f ->
29
		name = f.getName()
30
		idx = name.lastIndexOf(".")
31
		if (idx > 0) name = name.substring(0, idx)
32
		dir = f.getParentFile()
33
		println "Processing "+name+"..."
34

  
35
		pdfFile = f.getAbsolutePath()
36
		if (pdfFile.toUpperCase().endsWith(".PDF")) {
37
			textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
38
			try {
39
				doc = Loader.loadPDF(f)
40
				pdfRenderer = new PDFRenderer(doc)
41
				for (page = 0; page < doc.getNumberOfPages(); ++page) {
42
    					bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB)
43
    					fos = new FileOutputStream(new File(dir, name + "-" + (page+1) + ".png"))
44
					ImageIOUtil.writeImage(bim, "png", fos, 300)
45
				}
46
				fos.close()
47
				doc.close()
48
				nFiles++
49
			} catch (Exception e) {
50
				e.printStackTrace()
51
			}
29
		if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
30
	 		processFile(f)
31
	 		nFiles++
52 32
		}
53 33
	}
54 34
	println "Processed "+nFiles+" files."
55 35
} else {
36
	if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
37
	 processFile(input_file)
38
	}
39
}
40

  
41
def processFile(input_file) {
42

  
56 43
	name = input_file.getName()
57 44
	dir = input_file.getParentFile()
58 45
	idx = name.lastIndexOf(".")
......
60 47

  
61 48
	pdfFile = input_file.getAbsolutePath()
62 49
	if (pdfFile.toUpperCase().endsWith(".PDF")) {
50
	
51
		println "Processing "+name+"..."
52
		
63 53
		textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
64 54
		try {
65 55
			doc = Loader.loadPDF(input_file)

Formats disponibles : Unified diff