/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3541

     package org.txm.macro.pdf
     // STANDARD DECLARATIONS
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     import org.apache.pdfbox.Loader
     import org.apache.pdfbox.text.PDFTextStripperByArea
     import org.apache.pdfbox.cos.COSName
     import java.awt.geom.Rectangle2D
     import java.awt.geom.Rectangle2D.Float
     import org.apache.pdfbox.cos.COSArray
     import org.apache.pdfbox.cos.COSFloat
     import org.apache.pdfbox.pdmodel.common.PDRectangle
     // BEGINNING OF PARAMETERS
     // Single .pdf file to process; the script only uses it when input_dir is null or does not exist.
     @Field @Option(name="input_file", usage=".pdf input file", widget="File", required=false, def="")
     def input_file
     // Directory whose .pdf files are processed in batch; when it exists it takes precedence over input_file.
     @Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="")
     def input_dir
     // Open the parameters input dialog box; the script stops here when open() returns false
     if (!ParametersDialog.open(this)) return
     // END OF PARAMETERS
     // Batch mode: export the annotations of every PDF found directly in input_dir.
     if (input_dir != null && input_dir.exists()) {
     	int nFiles = 0
     	// (?i) makes the name match case-insensitive so that ".PDF" files are listed too,
     	// and the dot is escaped so that only a real ".pdf" extension matches.
     	input_dir.eachFileMatch(~/(?i).*\.pdf/) { f ->
     		processFile(f)
     		nFiles++
     	}
     	println "Processed "+nFiles+" files."
     } else if (input_file != null && input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
     	// Single-file mode: used when no existing input directory was given.
     	processFile(input_file)
     } else {
     	// Neither parameter is usable: say so instead of failing on a null input_file.
     	println "No .pdf input file or existing input directory to process."
     }
     /**
      * Exports the annotations of one PDF file to a tab-separated file.
      *
      * The result is written next to the input file as "<name>-annotations.tsv" (UTF-8) with the
      * columns page, type, highlight and comment. For "Highlight" annotations, the text found under
      * each quadrilateral of the QuadPoints entry is extracted and the pieces are joined with a
      * space. Annotations of any other type are reported only when they carry a comment.
      * Exceptions are printed with their stack trace and are not propagated to the caller.
      *
      * @param input_file the .pdf file to read
      */
     def processFile(input_file) {
     	def name = input_file.getName()
     	def idx = name.lastIndexOf(".")
     	if (idx > 0) name = name.substring(0, idx)
     	def outputFile = new File(input_file.getParentFile(), name + "-annotations.tsv")
     	println "Processing "+name+"..."
     	// TSV cell quoting: wrap the value in double quotes and double any embedded quote
     	def quote = { String s -> '"' + s.replaceAll('"', '""') + '"' }
     	try {
     		// load the PDF before creating the output file, so that an unreadable PDF
     		// does not leave an empty .tsv file behind
     		def doc = Loader.loadPDF(input_file)
     		try {
     			// withWriter closes the writer itself, no explicit close() is needed
     			outputFile.withWriter("UTF-8") { writer ->
     				writer.println "page\ttype\thighlight\tcomment"
     				for (int npage = 0; npage < doc.getNumberOfPages(); ++npage) {
     					def page = doc.getPage(npage)
     					def pageHeight = page.getMediaBox().getHeight()
     					def stripper = new PDFTextStripperByArea()
     					stripper.setSortByPosition(true)
     					page.getAnnotations().each { annotation ->
     						def comment = annotation.getContents()
     						def prefix = (npage+1)+"\t"+annotation.getSubtype()
     						if ("Highlight".equals(annotation.getSubtype())) {
     							// adapted from https://www.anycodings.com/1questions/5047580/java-apache-pdfbox-extract-highlighted-text
     							def quads = annotation.getCOSObject().getDictionaryObject(COSName.getPDFName("QuadPoints"))
     							// a missing or too short QuadPoints entry yields an empty highlight cell
     							int nQuads = (quads instanceof COSArray) ? quads.size().intdiv(8) : 0
     							def pieces = []
     							for (int q = 0; q < nQuads; q++) {
     								// each quadrilateral is 8 numbers: upper left, upper right, lower left, lower right (x, y)
     								int k = q * 8
     								def ulx = quads.get(k).floatValue() - 1	// upper left x, widened by one unit
     								def uly = pageHeight - quads.get(k+1).floatValue()	// upper left y, flipped against the page height
     								def width = quads.get(k+2).floatValue() - quads.get(k+4).floatValue()	// upperRightX - lowerLeftX
     								def height = quads.get(k+3).floatValue() - quads.get(k+5).floatValue()	// upperRightY - lowerLeftY
     								stripper.addRegion("highlightedRegion", new Rectangle2D.Double(ulx, uly, width, height))
     								stripper.extractRegions(page)
     								pieces << stripper.getTextForRegion("highlightedRegion").trim()
     							}
     							writer.print prefix
     							writer.print "\t" + quote(pieces.join(" "))
     							writer.println "\t" + quote(comment != null ? comment : "")
     						} else if (comment != null) {
     							// other annotation types have no highlighted text: empty highlight cell
     							writer.println prefix + '\t""\t' + quote(comment)
     						}
     					}
     				}
     			} // writer
     		} finally {
     			// release the document even when the export fails half-way
     			doc.close()
     		}
     	} catch (Exception e) {
     		e.printStackTrace()
     	}
     }

     @Field @Option(name="input_dir", usage="The directory containing the .pdf files to read", widget="Folder", required=false, def="")
     def input_dir
     @Field @Option(name="set_sort_by_position", usage="""The order of the text tokens in a PDF file may not be in the same as they appear visually on the screen. For example, a PDF writer may write out all text by font, so all bold or larger text, then make a second pass and write out the normal text.
     The default is to not sort by position.
     A PDF writer could choose to write each character in a different order. By default PDFBox does not sort the text tokens before processing them due to performance reasons.""", widget="Boolean", required=true, def="false")
     def set_sort_by_position
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return
-...
     // Batch mode when input_dir exists: visits the directory entries whose name matches /.*.pdf/;
     // otherwise the else branch at the bottom falls back to the single input_file.
     // NOTE(review): the braces of this span do not balance, and the inline extraction below
     // repeats the steps of processFile() — it looks like the previous inline version displayed
     // together with the processFile(f) call that replaces it. Confirm against the unified diff.
     if (input_dir != null &&  input_dir.exists()) {
     	nFiles = 0
     	input_dir.eachFileMatch(~/.*.pdf/) { f ->
     		name = f.getName()
     		println "Processing "+name+"..."
     		// strip the extension to build the name of the .txt output file
     		idx = name.lastIndexOf(".")
     		if (idx > 0) name = name.substring(0, idx)
     		outputFile = new File(f.getParentFile(), name + ".txt")
     		pdfFile = f.getAbsolutePath()
     		if (pdfFile.toUpperCase().endsWith(".PDF")) {
     			// NOTE(review): textFile is computed but not used in the visible lines
     			textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
     			try {
     				outputFile.withWriter("UTF-8") { writer ->
     				doc = Loader.loadPDF(f)
     				strip = new PDFTextStripper()
     				strip.setSortByPosition(true)
     				// the string "<pb/>" is emitted at the start of every page
     				strip.setPageStart("<pb/>")
     				// strip.setParagraphStart("\n<p>")
     				// strip.setParagraphEnd("</p>")
     				writer.print strip.getText(doc)
     				doc.close()
     				writer.close()
     				nFiles++
+    				}
     			} catch (Exception e) {
     			e.printStackTrace()
+    			}
     		// delegate the work on one file to processFile() and count it
     		if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
     	 		processFile(f)
     	 		nFiles++
+    		}
+    	}
     	println "Processed "+nFiles+" files."
     } else {
     	// single-file mode: input_file is processed only when its path ends with ".pdf" (any case)
     	if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
     	 processFile(input_file)
+    	}
+    }
     def processFile(input_file) {
     	name = input_file.getName()
     	idx = name.lastIndexOf(".")
     	if (idx > 0) name = name.substring(0, idx)
-...
     	pdfFile = input_file.getAbsolutePath()
     	if (pdfFile.toUpperCase().endsWith(".PDF")) {
     		println "Processing "+name+"..."
     		textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
     		try {
     			outputFile.withWriter("UTF-8") { writer ->
     			doc = Loader.loadPDF(input_file)
     			strip = new PDFTextStripper()
     /*
     			println "SpacingTolerance = "+strip.getSpacingTolerance()
     			// Set the space width-based tolerance value that is used to estimate where spaces in text should be added. Note that the default value for this has been determined from trial and error. Setting this value larger will reduce the number of spaces added.
     			println "AverageCharTolerance = "+strip.getAverageCharTolerance()
-...
     			// sets the multiple of whitespace character widths for the current text which the current line start can be indented from the previous line start beyond which the current line start is considered to be a paragraph start. The default value is 2.0.
     			println "DropThreshold = "+strip.getDropThreshold()
     			// sets the minimum whitespace, as a multiple of the max height of the current characters beyond which the current line start is considered to be a paragraph start. The default value is 2.5.
     			strip.setSortByPosition(true)
     */
     /*
     			println "SeparateByBeads = "+strip.getSeparateByBeads()
     			// this will tell if the text stripper should separate by beads
     			println "ArticleStart = "+strip.getArticleStart()
     			// the string which will be used at the beginning of an article
     			println "ArticleEnd = "+strip.getArticleEnd()
     			// the string which will be used at the end of an article
     			println "AddMoreFormatting = "+strip.getAddMoreFormatting()
     			// this will tell if the text stripper should add some more text formatting
     			println "LineSeparator = "+strip.getLineSeparator()
     			// the desired line separator for output text. The line.separator system property is used if the line separator preference is not set explicitly
     			println "WordSeparator = "+strip.getWordSeparator()
     			// set the desired word separator for output text. The PDFBox text extraction algorithm will output a space character if there is enough space between two words. By default a space character is used. If you need and accurate count of characters that are found in a PDF document then you might want to set the word separator to the empty string
     */
     			strip.setSortByPosition(set_sort_by_position)
     			strip.setPageStart("<pb/>")
     			// strip.setParagraphStart("\n<p>")
     			// strip.setParagraphEnd("</p>")

     // Batch mode when input_dir exists: visits the directory entries whose name matches /.*.pdf/;
     // otherwise the else branch at the bottom falls back to the single input_file.
     // NOTE(review): the braces of this span do not balance, and the inline rendering below is
     // followed by a processFile(f) call on the same file — it looks like the previous inline
     // version displayed together with its replacement. Confirm against the unified diff.
     if (input_dir != null &&  input_dir.exists()) {
     	nFiles = 0
     	input_dir.eachFileMatch(~/.*.pdf/) { f ->
     		name = f.getName()
     		// strip the extension: name becomes the prefix of the generated image files
     		idx = name.lastIndexOf(".")
     		if (idx > 0) name = name.substring(0, idx)
     		dir = f.getParentFile()
     		println "Processing "+name+"..."
     		pdfFile = f.getAbsolutePath()
     		if (pdfFile.toUpperCase().endsWith(".PDF")) {
     			// NOTE(review): textFile is computed but not used in the visible lines
     			textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
     			try {
     				doc = Loader.loadPDF(f)
     				pdfRenderer = new PDFRenderer(doc)
     				// one RGB image per page at 300 DPI, written as "<name>-<page number>.png" in the PDF's directory
     				for (page = 0; page < doc.getNumberOfPages(); ++page) {
         					bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB)
         					fos = new FileOutputStream(new File(dir, name + "-" + (page+1) + ".png"))
     					ImageIOUtil.writeImage(bim, "png", fos, 300)
+    				}
     				// NOTE(review): fos is reassigned for every page but closed only here, after the loop,
     				// so only the stream of the last page is closed by this call
     				fos.close()
     				doc.close()
     				nFiles++
     			} catch (Exception e) {
     				e.printStackTrace()
+    			}
     		// delegate the work on one file to processFile() and count it
     		if (f.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
     	 		processFile(f)
     	 		nFiles++
+    		}
+    	}
     	println "Processed "+nFiles+" files."
     } else {
     	// single-file mode: input_file is processed only when its path ends with ".pdf" (any case)
     	if (input_file.getAbsolutePath().toUpperCase().endsWith(".PDF")) {
     	 processFile(input_file)
+    	}
+    }
     def processFile(input_file) {
     	name = input_file.getName()
     	dir = input_file.getParentFile()
     	idx = name.lastIndexOf(".")
-...
     	pdfFile = input_file.getAbsolutePath()
     	if (pdfFile.toUpperCase().endsWith(".PDF")) {
     		println "Processing "+name+"..."
     		textFile = pdfFile.substring(0, pdfFile.length() - 3) + "txt"
     		try {
     			doc = Loader.loadPDF(input_file)

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3541