Révision 3896
| TXM/trunk/bundles/org.txm.core/src/java/org/txm/importer/ConvertDocument.java (revision 3896) | ||
|---|---|---|
| 1 |
package org.txm.importer; |
|
| 2 |
import java.io.File; |
|
| 3 |
import java.util.Arrays; |
|
| 4 |
import java.util.List; |
|
| 5 |
|
|
| 6 |
import org.jodconverter.core.office.OfficeException; |
|
| 7 |
import org.jodconverter.core.office.OfficeManager; |
|
| 8 |
import org.jodconverter.local.JodConverter; |
|
| 9 |
import org.jodconverter.local.office.LocalOfficeManager; |
|
| 10 |
|
|
| 11 |
public class ConvertDocument {
|
|
| 12 |
|
|
| 13 |
boolean DEBUG = false; |
|
| 14 |
List<String> supportedInput = Arrays.asList("odt", "doc", "docx", "html", "pdf"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
|
|
| 15 |
List<String> supportedOutput = Arrays.asList("odt", "doc", "pdf"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
|
|
| 16 |
OfficeManager officeManager = null; |
|
| 17 |
|
|
| 18 |
public ConvertDocument(String officeHome) throws OfficeException {
|
|
| 19 |
//officeManager.setConnectionProtocol(OfficeConnectionProtocol.PIPE); |
|
| 20 |
//officeManager.setPipeNames("office1", "office2");
|
|
| 21 |
//officeManager.setTaskExecutionTimeout(30000L); |
|
| 22 |
officeManager = LocalOfficeManager.builder().build(); |
|
| 23 |
|
|
| 24 |
try {
|
|
| 25 |
officeManager.start(); |
|
| 26 |
} catch(Exception e) {
|
|
| 27 |
officeManager.stop(); |
|
| 28 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
| 29 |
} |
|
| 30 |
} |
|
| 31 |
|
|
| 32 |
public ConvertDocument() throws OfficeException {
|
|
| 33 |
|
|
| 34 |
officeManager = LocalOfficeManager.builder().build(); |
|
| 35 |
try {
|
|
| 36 |
officeManager.start(); |
|
| 37 |
} catch(Exception e) {
|
|
| 38 |
officeManager.stop(); |
|
| 39 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
| 40 |
} |
|
| 41 |
} |
|
| 42 |
|
|
| 43 |
public void setDebug(boolean b) {
|
|
| 44 |
DEBUG = b; |
|
| 45 |
} |
|
| 46 |
|
|
| 47 |
public void stop() throws OfficeException {
|
|
| 48 |
officeManager.stop(); |
|
| 49 |
} |
|
| 50 |
|
|
| 51 |
public File toODT(File document, File outdir) throws Exception {
|
|
| 52 |
return auto(document, outdir, "odt"); //$NON-NLS-1$ |
|
| 53 |
} |
|
| 54 |
|
|
| 55 |
public File toDOC(File document, File outdir) throws Exception {
|
|
| 56 |
return auto(document, outdir, "doc"); //$NON-NLS-1$ |
|
| 57 |
} |
|
| 58 |
|
|
| 59 |
public File toPDF(File document, File outdir) throws Exception {
|
|
| 60 |
return auto(document, outdir, "pdf"); //$NON-NLS-1$ |
|
| 61 |
} |
|
| 62 |
|
|
| 63 |
public File auto(File document, File outdir, String ext) throws Exception {
|
|
| 64 |
|
|
| 65 |
//get filename without ext |
|
| 66 |
int idx = document.getName().lastIndexOf("."); //$NON-NLS-1$
|
|
| 67 |
if (idx == -1) return null; |
|
| 68 |
String name = document.getName().substring(0, idx-1); |
|
| 69 |
|
|
| 70 |
File outfile = new File(outdir, name+".odt"); //$NON-NLS-1$ |
|
| 71 |
try {
|
|
| 72 |
return autoFile(document, outfile, ext); |
|
| 73 |
} catch(Exception e) {
|
|
| 74 |
if (e instanceof OfficeException) {
|
|
| 75 |
OfficeException oe = (OfficeException) e; |
|
| 76 |
if (oe.getMessage().contains("officeHome not set")) {
|
|
| 77 |
throw new OfficeException("Could not find LibreOffice or OpenOffice 3.x installation. Reason="+e.getMessage());
|
|
| 78 |
} else if (oe.getMessage().contains("is already running")) {
|
|
| 79 |
String pid = oe.getMessage(); |
|
| 80 |
if (pid.indexOf("pid") > 0) {
|
|
| 81 |
pid = pid.substring(pid.indexOf("pid"));
|
|
| 82 |
} |
|
| 83 |
throw new OfficeException("Could not run LibreOffice or OpenOffice because a 'soffice' process is already running (process id="+pid+"). Please kill the process and restart. Reason="+e.getMessage());
|
|
| 84 |
} else |
|
| 85 |
throw e; |
|
| 86 |
} else |
|
| 87 |
throw e; |
|
| 88 |
} |
|
| 89 |
} |
|
| 90 |
|
|
| 91 |
public File autoFile(File document, File outdir, String ext) throws Exception {
|
|
| 92 |
|
|
| 93 |
ConvertDocument converter = new ConvertDocument(); |
|
| 94 |
if ("doc".equals(ext.toLowerCase())) {
|
|
| 95 |
return converter.toDOC(document, outdir); |
|
| 96 |
} else if ("odt".equals(ext.toLowerCase())) {
|
|
| 97 |
return converter.toODT(document, outdir); |
|
| 98 |
} if ("pdf".equals(ext.toLowerCase())) {
|
|
| 99 |
return converter.toPDF(document, outdir); |
|
| 100 |
} else {
|
|
| 101 |
return null; |
|
| 102 |
} |
|
| 103 |
} |
|
| 104 |
|
|
| 105 |
/** |
|
| 106 |
* @param args |
|
| 107 |
* @throws OfficeException |
|
| 108 |
*/ |
|
| 109 |
public static void main(String[] args) throws OfficeException {
|
|
| 110 |
//converter.convert(infile, outfile, new ) |
|
| 111 |
// String[] srcexts = { "doc" };
|
|
| 112 |
// String[] exts = { "odt" };//, "odt", "doc", "docx", "html", "pdf"}; // html pdf
|
|
| 113 |
// for(String srcext : srcexts) {
|
|
| 114 |
// File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/orig."+srcext);
|
|
| 115 |
// for(String ext : exts) {
|
|
| 116 |
// System.out.println("Convert from "+srcext+" to "+ext);
|
|
| 117 |
// OfficeManager officeManager = null; |
|
| 118 |
// try {
|
|
| 119 |
// officeManager = new DefaultOfficeManagerConfiguration().buildOfficeManager(); |
|
| 120 |
// officeManager.start(); |
|
| 121 |
// |
|
| 122 |
// OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager); |
|
| 123 |
// System.out.println(converter.getFormatRegistry().getClass()); |
|
| 124 |
// File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/"+srcext+"."+ext);
|
|
| 125 |
// converter.convert(infile, outfile, converter.getFormatRegistry().getFormatByExtension(ext)); |
|
| 126 |
// |
|
| 127 |
// } catch(Exception e) {
|
|
| 128 |
// System.out.println(e.getLocalizedMessage()); |
|
| 129 |
// } finally {
|
|
| 130 |
// officeManager.stop(); |
|
| 131 |
// } |
|
| 132 |
// } |
|
| 133 |
// } |
|
| 134 |
|
|
| 135 |
File infile = new File("/home/mdecorde/TEMP/Corpus_Riverains_TXM/EP27_txm.doc"); //$NON-NLS-1$
|
|
| 136 |
File outdir = new File("/home/mdecorde/TEMP"); //$NON-NLS-1$
|
|
| 137 |
JodConverter.convert(infile).to(new File(outdir, "test.odt")); |
|
| 138 |
//System.out.println("result: "+ConvertDocument.toODT(infile, outdir));
|
|
| 139 |
ConvertDocument convert = new ConvertDocument(); |
|
| 140 |
try {
|
|
| 141 |
System.out.println("result: "+convert.toODT(infile, outdir)); //$NON-NLS-1$
|
|
| 142 |
} catch(Exception e) { org.txm.utils.logger.Log.printStackTrace(e);}
|
|
| 143 |
convert.stop(); |
|
| 144 |
} |
|
| 145 |
|
|
| 146 |
} |
|
| TXM/trunk/bundles/org.txm.libs.jodconverter/src/org/txm/libs/jodconverter/ConvertDocument.java (revision 3896) | ||
|---|---|---|
| 7 | 7 |
import org.jodconverter.core.office.OfficeUtils; |
| 8 | 8 |
import org.jodconverter.local.JodConverter; |
| 9 | 9 |
import org.jodconverter.local.office.LocalOfficeManager; |
| 10 |
import org.jodconverter.local.office.LocalOfficeManager.Builder; |
|
| 10 | 11 |
|
| 11 | 12 |
public class ConvertDocument {
|
| 12 | 13 |
|
| 13 |
public static void test2(File inputFile, File outputFile) throws OfficeException {
|
|
| 14 |
// Create an office manager using the default configuration. |
|
| 15 |
// The default port is 2002. Note that when an office manager |
|
| 16 |
// is installed, it will be the one used by default when |
|
| 17 |
// a converter is created. |
|
| 18 |
final LocalOfficeManager officeManager = LocalOfficeManager.install(); |
|
| 14 |
public static void convert(File inputFile, File outputFile) throws OfficeException {
|
|
| 15 |
convert(inputFile, outputFile, null); |
|
| 16 |
} |
|
| 17 |
|
|
| 18 |
public static void convert(File inputFile, File outputFile, String officePath) throws OfficeException {
|
|
| 19 |
|
|
| 20 |
Builder builder = LocalOfficeManager.builder().install(); |
|
| 21 |
if (officePath != null && officePath.length() > 0) {
|
|
| 22 |
builder.officeHome(officePath); |
|
| 23 |
} |
|
| 24 |
OfficeManager officeManager = builder.build(); |
|
| 19 | 25 |
try {
|
| 20 |
|
|
| 21 |
// Start an office process and connect to the started instance (on port 2002). |
|
| 22 |
officeManager.start(); |
|
| 23 |
|
|
| 24 |
// Convert |
|
| 25 |
JodConverter |
|
| 26 |
.convert(inputFile) |
|
| 27 |
.to(outputFile) |
|
| 28 |
.execute(); |
|
| 26 |
// Start an office process and connect to the started instance (on port 2002). |
|
| 27 |
officeManager.start(); |
|
| 28 |
// Convert |
|
| 29 |
JodConverter |
|
| 30 |
.convert(inputFile) |
|
| 31 |
.to(outputFile) |
|
| 32 |
.execute(); |
|
| 29 | 33 |
} finally {
|
| 30 |
// Stop the office process
|
|
| 31 |
OfficeUtils.stopQuietly(officeManager);
|
|
| 34 |
// Stop the office process
|
|
| 35 |
OfficeUtils.stopQuietly(officeManager);
|
|
| 32 | 36 |
} |
| 33 | 37 |
} |
| 34 | 38 |
|
| 35 |
public static void test1(File inputFile, File outputFile) throws OfficeException {
|
|
| 39 |
public static void setOfficeHome(String officeHome) {
|
|
| 36 | 40 |
|
| 37 |
OfficeManager officeManager = LocalOfficeManager.builder() |
|
| 38 |
.install() |
|
| 39 |
.officeHome("/opt/libreoffice7.5")
|
|
| 40 |
.build(); |
|
| 41 |
try {
|
|
| 42 |
// Start an office process and connect to the started instance (on port 2002). |
|
| 43 |
officeManager.start(); |
|
| 44 |
// Convert |
|
| 45 |
JodConverter |
|
| 46 |
.convert(inputFile) |
|
| 47 |
.to(outputFile) |
|
| 48 |
.execute(); |
|
| 49 |
} finally {
|
|
| 50 |
// Stop the office process |
|
| 51 |
OfficeUtils.stopQuietly(officeManager); |
|
| 52 |
} |
|
| 41 |
if (officeHome == null) {
|
|
| 42 |
System.setProperty("office.home", "");
|
|
| 43 |
} else {
|
|
| 44 |
System.setProperty("office.home", officeHome);
|
|
| 45 |
|
|
| 46 |
Builder builder = LocalOfficeManager.builder().install(); |
|
| 47 |
builder.officeHome(officeHome); |
|
| 48 |
} |
|
| 53 | 49 |
} |
| 54 | 50 |
|
| 55 | 51 |
public static void main(String[] args) throws OfficeException {
|
| 56 | 52 |
|
| 57 |
File inputFile = new File("/home/mdecorde/Documents/FICHE d'inscription à une formation ENS - DECORDE.doc");
|
|
| 58 |
File outputFile = new File("/home/mdecorde/Documents/FICHE d'inscription à une formation ENS - DECORDE.odt");
|
|
| 53 |
File inputFile = new File("/home/mdecorde/xml/doc/Formulaire_SFT 2022 - Decorde.docx");
|
|
| 54 |
File outputFile = new File("/home/mdecorde/xml/doc/Formulaire_SFT 2022 - Decorde.odt");
|
|
| 59 | 55 |
|
| 60 |
test2(inputFile, outputFile);
|
|
| 56 |
convert(inputFile, outputFile);
|
|
| 61 | 57 |
} |
| 62 | 58 |
} |
| TXM/trunk/bundles/org.txm.libs.jodconverter/META-INF/MANIFEST.MF (revision 3896) | ||
|---|---|---|
| 15 | 15 |
org.jodconverter.local.office, |
| 16 | 16 |
org.jodconverter.local.office.utils, |
| 17 | 17 |
org.jodconverter.local.process, |
| 18 |
org.jodconverter.local.task |
|
| 18 |
org.jodconverter.local.task, |
|
| 19 |
org.txm.libs.jodconverter |
|
| 19 | 20 |
Require-Bundle: org.txm.libs.gson;bundle-version="2.8.6" |
| 20 | 21 |
Bundle-Vendor: JodConverter |
| 21 | 22 |
Automatic-Module-Name: org.txm.libs.jodconverter |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/text/Text2TXTMacro.groovy (revision 3896) | ||
|---|---|---|
| 1 | 1 |
package org.txm.macro.text; |
| 2 | 2 |
// STANDARD DECLARATIONS |
| 3 | 3 |
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 | 4 |
import groovy.transform.Field |
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
import org.txm.doc.*; |
|
| 8 |
import org.txm.importer.*; |
|
| 5 |
import org.txm.libs.jodconverter.ConvertDocument |
|
| 9 | 6 |
|
| 10 | 7 |
// BEGINNING OF PARAMETERS |
| 11 | 8 |
@Field @Option(name="inputDirectory", usage="the directory containing the DOC/ODT/RTF files to convert", widget="Folder", required=true, def="") |
| ... | ... | |
| 19 | 16 |
|
| 20 | 17 |
boolean debug = false; |
| 21 | 18 |
|
| 22 |
ConvertDocument converter; |
|
| 23 | 19 |
def files = [] |
| 24 | 20 |
try {
|
| 25 |
converter = new ConvertDocument(); |
|
| 26 | 21 |
inputDirectory.eachFileMatch(~/.+\.$extension/) { docFile ->
|
| 27 | 22 |
String name = docFile.getName() |
| 28 | 23 |
name = name.substring(0, name.lastIndexOf("."))
|
| 29 | 24 |
def txtFile = new File(docFile.getParentFile(), name+".txt") |
| 30 |
converter.autoFile(docFile, txtFile, "txt")
|
|
| 25 |
ConvertDocument.convert(docFile, txtFile)
|
|
| 31 | 26 |
files << docFile |
| 32 | 27 |
} |
| 33 | 28 |
} catch(Exception e) {
|
| 34 | 29 |
println "Error while processing directory: "+e; |
| 35 | 30 |
if (debug) e.printStackTrace(); |
| 36 | 31 |
} |
| 37 |
if (converter != null) converter.stop(); |
|
| 38 | 32 |
|
| 39 | 33 |
println "Processed directory: $inputDirectory" |
| 40 | 34 |
println "files: "+files |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/text/SetOfficeMacro.groovy (revision 3896) | ||
|---|---|---|
| 1 |
// STANDARD DECLARATIONS |
|
| 2 |
package org.txm.macro.office |
|
| 3 |
|
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 |
import groovy.transform.Field |
|
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
import org.txm.libs.jodconverter.ConvertDocument |
|
| 8 |
|
|
| 9 |
// BEGINNING OF PARAMETERS |
|
| 10 |
|
|
| 11 |
@Field @Option(name="office_path", usage="Path to LibreOffice or OpenOffice installation directory", widget="Folder", required=false, def="libreoffice or openoffice install directory") |
|
| 12 |
def office_path |
|
| 13 |
|
|
| 14 |
// Open the parameters input dialog box |
|
| 15 |
if (!ParametersDialog.open(this)) return; |
|
| 16 |
|
|
| 17 |
// END OF PARAMETERS |
|
| 18 |
|
|
| 19 |
// Validate the selected folder: the first failing check gives the message to
// print, and the macro stops without changing the office path.
String problem = null
if (office_path == null) {
	problem = "No path to office directory given."
}
else if (!office_path.exists()) {
	problem = "'$office_path' directory not found."
}
else if (!office_path.isDirectory()) {
	problem = "'$office_path' exists but is not a directory."
}
else if (!office_path.canExecute()) {
	problem = "'$office_path' exists but has not sufficent rights to be used."
}

if (problem != null) {
	println problem
	return
}

// Remember the current "office.home" value so it can be reported after the change.
def previousHome = System.getProperty("office.home")

// Register the new office directory through ConvertDocument.
ConvertDocument.setOfficeHome(office_path.getAbsolutePath())

println "Office path set to '"+System.getProperty("office.home")+"'."
if (previousHome != null) println " Previous path was '${previousHome}'."
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/prototypes/export/office/SetOfficeMacro.groovy (revision 3896) | ||
|---|---|---|
| 1 |
// STANDARD DECLARATIONS |
|
| 2 |
package org.txm.macro.office |
|
| 3 |
|
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 |
import groovy.transform.Field |
|
| 6 |
import org.txm.rcp.swt.widget.parameters.* |
|
| 7 |
|
|
| 8 |
// BEGINNING OF PARAMETERS |
|
| 9 |
|
|
| 10 |
@Field @Option(name="office_path", usage="Path to LibreOffice or OpenOffice installation directory", widget="Folder", required=false, def="libreoffice or openoffice install directory") |
|
| 11 |
def office_path |
|
| 12 |
|
|
| 13 |
// Open the parameters input dialog box |
|
| 14 |
if (!ParametersDialog.open(this)) return; |
|
| 15 |
|
|
| 16 |
// END OF PARAMETERS |
|
| 17 |
|
|
| 18 |
// Validate the selected folder: the first failing check gives the message to
// print, and the macro stops without changing the office path.
String problem = null
if (office_path == null) {
	problem = "No path to office directory given."
}
else if (!office_path.exists()) {
	problem = "'$office_path' directory not found."
}
else if (!office_path.isDirectory()) {
	problem = "'$office_path' exists but is not a directory."
}
else if (!office_path.canExecute()) {
	problem = "'$office_path' exists but has not sufficent rights to be used."
}

if (problem != null) {
	println problem
	return
}

// Remember the current "office.home" value so it can be reported after the change.
def previousHome = System.getProperty("office.home")

// Store the new office directory in the "office.home" system property.
System.setProperty("office.home", office_path.getAbsolutePath())

println "Office path set to '"+System.getProperty("office.home")+"'."
if (previousHome != null) println " Previous path was '${previousHome}'."
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Transana2TRS.groovy (revision 3896) | ||
|---|---|---|
| 58 | 58 |
import java.text.ParseException; |
| 59 | 59 |
import java.text.SimpleDateFormat; |
| 60 | 60 |
import java.util.Date; |
| 61 |
import org.txm.libs.jodconverter.ConvertDocument |
|
| 61 | 62 |
|
| 62 | 63 |
class Transana2TRS {
|
| 64 |
|
|
| 63 | 65 |
boolean debug = false; |
| 64 | 66 |
boolean isDirectory = false; |
| 65 | 67 |
File outDir; |
| 66 | 68 |
File dtd; |
| 67 |
ConvertDocument converter; |
|
| 68 | 69 |
def monitor |
| 69 | 70 |
|
| 70 | 71 |
Transana2TRS(File outDir, File dtd, boolean debug, def monitor) {
|
| ... | ... | |
| 92 | 93 |
boolean ret = true |
| 93 | 94 |
try {
|
| 94 | 95 |
println "* Processing $dir directory" |
| 95 |
converter = new ConvertDocument(); |
|
| 96 | 96 |
def files = dir.listFiles() |
| 97 | 97 |
def okfiles = []; |
| 98 | 98 |
if (files != null) |
| ... | ... | |
| 126 | 126 |
println "Error while processing directory: "+e; |
| 127 | 127 |
if (debug) e.printStackTrace(); |
| 128 | 128 |
} |
| 129 |
if (converter != null) converter.stop(); |
|
| 130 | 129 |
return ret; |
| 131 | 130 |
} |
| 132 | 131 |
|
| ... | ... | |
| 202 | 201 |
boolean DOCtoHTML(File docFile, File htmlFile) {
|
| 203 | 202 |
println "*** ODT -> HTML" |
| 204 | 203 |
try {
|
| 205 |
if (!isDirectory) converter = new ConvertDocument(); |
|
| 206 |
converter.setDebug(debug) |
|
| 207 |
converter.autoFile(docFile, htmlFile, "html") |
|
| 204 |
ConvertDocument.convert(docFile, htmlFile) |
|
| 208 | 205 |
} catch(Exception e) {
|
| 209 | 206 |
println "Error while converting $docFile : $e" |
| 210 | 207 |
if (debug) e.printStackTrace() |
| 211 | 208 |
} finally {
|
| 212 |
if (!isDirectory && converter != null) converter.stop() |
|
| 213 | 209 |
} |
| 214 | 210 |
return htmlFile.exists() && htmlFile.length() > 0 |
| 215 | 211 |
} |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/TextTranscription2TRS.groovy (revision 3896) | ||
|---|---|---|
| 36 | 36 |
|
| 37 | 37 |
import org.txm.scripts.importer.*; |
| 38 | 38 |
import org.xml.sax.Attributes; |
| 39 |
import org.txm.importer.ConvertDocument; |
|
| 40 | 39 |
import org.txm.importer.scripts.filters.*; |
| 41 | 40 |
|
| 42 | 41 |
import java.io.File; |
| ... | ... | |
| 56 | 55 |
import java.text.ParseException; |
| 57 | 56 |
import java.text.SimpleDateFormat; |
| 58 | 57 |
import java.util.Date; |
| 58 |
import org.txm.libs.jodconverter.ConvertDocument |
|
| 59 |
|
|
| 59 | 60 |
/** |
| 60 | 61 |
* BUGS: |
| 61 | 62 |
* - n'a pas repéré les timings autour des commentaires |
| ... | ... | |
| 68 | 69 |
boolean debug = false; |
| 69 | 70 |
boolean isDirectory = false; |
| 70 | 71 |
File outDir; |
| 71 |
ConvertDocument converter; |
|
| 72 | 72 |
def monitor |
| 73 | 73 |
|
| 74 | 74 |
TextTranscription2TRS(File outDir, boolean debug, def monitor) {
|
| ... | ... | |
| 95 | 95 |
boolean ret = true |
| 96 | 96 |
try {
|
| 97 | 97 |
println "* Processing $dir directory" |
| 98 |
converter = new ConvertDocument(); |
|
| 99 | 98 |
def files = dir.listFiles() |
| 100 | 99 |
def okfiles = []; |
| 101 | 100 |
if (files != null) |
| ... | ... | |
| 129 | 128 |
println "Error while processing directory: "+e; |
| 130 | 129 |
if (debug) e.printStackTrace(); |
| 131 | 130 |
} |
| 132 |
if (converter != null) converter.stop(); |
|
| 133 | 131 |
return ret; |
| 134 | 132 |
} |
| 135 | 133 |
|
| ... | ... | |
| 198 | 196 |
boolean DOCtoTXT(File docFile, File txtFile) {
|
| 199 | 197 |
println "*** ODT -> TXT" |
| 200 | 198 |
try {
|
| 201 |
if (!isDirectory) converter = new ConvertDocument(); |
|
| 202 |
converter.setDebug(debug) |
|
| 203 |
converter.autoFile(docFile, txtFile, "txt") |
|
| 199 |
ConvertDocument.convert(docFile, txtFile) |
|
| 204 | 200 |
} catch(Exception e) {
|
| 205 | 201 |
println "Error while converting $docFile : $e" |
| 206 | 202 |
if (debug) e.printStackTrace() |
| 207 |
} finally {
|
|
| 208 |
if (!isDirectory && converter != null) converter.stop() |
|
| 209 | 203 |
} |
| 210 | 204 |
return txtFile.exists() && txtFile.length() > 0 |
| 211 | 205 |
} |
| 212 | 206 |
|
| 213 |
// boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
|
|
| 214 |
// println "*** HTML -> HTML for tidy" |
|
| 215 |
// try {
|
|
| 216 |
// println "replace TABS with 4 spaces" |
|
| 217 |
// String text2 = htmlFile.getText("UTF-8")
|
|
| 218 |
// text2 = text2.replaceAll(" ", " ")
|
|
| 219 |
// text2 = text2.replaceAll("’", "'")
|
|
| 220 |
// text2 = text2.replaceAll("’", "'")
|
|
| 221 |
// text2 = text2.replaceAll("\t", " ")
|
|
| 222 |
// text2 = text2.replaceAll("\n", ' ')
|
|
| 223 |
// text2 = text2.replaceAll("\r\n", ' ')
|
|
| 224 |
// htmlFile2.withWriter("UTF-8") { writer ->
|
|
| 225 |
// writer.write(text2); |
|
| 226 |
// } |
|
| 227 |
// } catch(Exception e) {
|
|
| 228 |
// println "Error while preparing HTML of $htmlFile : $e" |
|
| 229 |
// if (debug) e.printStackTrace() |
|
| 230 |
// } |
|
| 231 |
// return htmlFile2.exists() && htmlFile2.length() > 0 |
|
| 232 |
// } |
|
| 233 |
// |
|
| 234 |
// boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
|
|
| 235 |
// println "*** HTML for tidy -> XHTML" |
|
| 236 |
// try {
|
|
| 237 |
// Tidy tidy = new Tidy(); // obtain a new Tidy instance |
|
| 238 |
// tidy.setXHTML(true); // set desired config options using tidy setters |
|
| 239 |
// tidy.setInputEncoding("UTF-8")
|
|
| 240 |
// tidy.setOutputEncoding("UTF-8")
|
|
| 241 |
// tidy.setShowErrors(100) |
|
| 242 |
// tidy.setShowWarnings(debug) |
|
| 243 |
// tidy.setTabsize(10) |
|
| 244 |
// tidy.setWraplen(9999) |
|
| 245 |
// tidy.setForceOutput(true) // Tidy won't stop if error are found |
|
| 246 |
// xhtmlFile.withWriter("UTF-8") { out ->
|
|
| 247 |
// def input = new InputStreamReader(htmlFile2.toURI().toURL().newInputStream(), "UTF-8") |
|
| 248 |
// tidy.parse(input, out); // run tidy, providing an input and output stream |
|
| 249 |
// } |
|
| 250 |
// if (xhtmlFile.exists()) {
|
|
| 251 |
// // JTidy produced a "0x0" char. removing them |
|
| 252 |
// // fix separated < and / ??? |
|
| 253 |
// def c = Character.toChars(0)[0] |
|
| 254 |
// String txttmp = xhtmlFile.getText("UTF-8");
|
|
| 255 |
// xhtmlFile.withWriter("UTF-8") { out ->
|
|
| 256 |
// out.write(txttmp.replace("<\n/", "</").replace("<\r\n/", "</"))
|
|
| 257 |
// } |
|
| 258 |
// } |
|
| 259 |
// } catch(Exception e) {
|
|
| 260 |
// println "Error while applying JTidy: "+e |
|
| 261 |
// if (debug) e.printStackTrace() |
|
| 262 |
// } |
|
| 263 |
// return xhtmlFile.exists() && xhtmlFile.length() > 0 |
|
| 264 |
// } |
|
| 265 |
|
|
| 266 | 207 |
boolean TXTtoTRS(File txtFile, File trsFile) {
|
| 267 | 208 |
println "*** TXT -> TRS" |
| 268 | 209 |
try {
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/clix/treetagger-wrapper-definition.xml (revision 3896) | ||
|---|---|---|
| 1 |
<?xml version="1.0"> |
|
| 1 |
<?xml version="1.0"?>
|
|
| 2 | 2 |
<application name="TreeTagger" version="0.0.0" desc="Tag files"> |
| 3 | 3 |
<progs> |
| 4 | 4 |
<prog exec="tree-tagger" version="0.4.6" desc="Tag"> |
| ... | ... | |
| 10 | 10 |
<arg state="optional" type="none" name="prob" desc="Print tag probabilities"/> |
| 11 | 11 |
<arg state="optional" type="none" name="ignore-prefix" desc=" Ignore prefix when guessing pos for unknown words"/> |
| 12 | 12 |
<arg state="optional" type="none" name="no-unknown" desc="Print the token rather than [unknown] for unknown lemma"/> |
| 13 |
<arg state="optional" type="none" name="cap-heuristics" desc="Look up unknown capitalized words in the list of lower-case words"/> |
|
| 14 | 13 |
<arg state="optional" type="none" name="hyphen-heuristics" desc="Turn on the heuristics fur guessing the parts of speech of unknown hyphenated words"/> |
| 15 | 14 |
<arg state="optional" type="none" name="quiet" desc="quiet mode"/> |
| 16 | 15 |
<arg state="optional" type="none" name="pt-with-prob" desc="pretagging with probabilities"/> |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/doc/docLoader.groovy (revision 3896) | ||
|---|---|---|
| 133 | 133 |
if (infile.isDirectory()) {
|
| 134 | 134 |
File unzipDir = new File(docfiles, infile.getName()) |
| 135 | 135 |
infile.renameTo(unzipDir) |
| 136 |
|
|
| 136 |
|
|
| 137 | 137 |
//println "zipdir "+unzipDir |
| 138 | 138 |
StylesToCSS converter = new StylesToCSS(unzipDir); |
| 139 | 139 |
if (!converter.process(new File(unzipDir, "style.css"))) {
|
| 140 | 140 |
println "WARNING: Failed to build css file of $unzipDir" |
| 141 | 141 |
} |
| 142 | 142 |
// and get the soft page breaks and styles parents |
| 143 |
def parentStyles = converter.parentStyles;
|
|
| 143 |
def parentStyles = converter.parentStyles |
|
| 144 | 144 |
def beforebreaks = converter.beforebreaks |
| 145 | 145 |
def afterbreaks = converter.afterbreaks |
| 146 |
|
|
| 146 |
|
|
| 147 | 147 |
//println "BEFORES: "+beforebreaks |
| 148 | 148 |
//println "AFTERS: "+afterbreaks |
| 149 | 149 |
//println "PARENTS: "+parentStyles |
| 150 |
|
|
| 150 |
|
|
| 151 | 151 |
// se servir de ça pour insérer <pb/> et remplacer styles automatiques |
| 152 | 152 |
File xmlFile = new File(txmDir, unzipDir.getName().substring(6)) |
| 153 | 153 |
//println "PATCH : $xmlFile" |
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/doc/DocumentToTei.groovy (revision 3896) | ||
|---|---|---|
| 30 | 30 |
import java.io.File; |
| 31 | 31 |
|
| 32 | 32 |
import org.txm.importer.ApplyXsl2; |
| 33 |
import org.txm.importer.ConvertDocument; |
|
| 34 | 33 |
import org.txm.utils.zip.Zip |
| 35 | 34 |
import org.txm.utils.FileUtils |
| 35 |
import org.txm.libs.jodconverter.ConvertDocument |
|
| 36 | 36 |
|
| 37 | 37 |
/** |
| 38 | 38 |
* The Class DocumentToTei. |
| ... | ... | |
| 49 | 49 |
if (!xsldir.exists()) { println "XslDir does not exists: "+xsldir; return false;}
|
| 50 | 50 |
if (!xslOdtTei.exists()) { println "xslOdtTei file does not exists: "+xslOdtTei; return false;}
|
| 51 | 51 |
|
| 52 |
ConvertDocument converter; |
|
| 53 | 52 |
for (File file : files) {
|
| 54 | 53 |
print "." |
| 55 | 54 |
String name = file.getName(); |
| ... | ... | |
| 69 | 68 |
File odtFile = File.createTempFile("workflowdocx", "sfsdf.odt", outdir);
|
| 70 | 69 |
|
| 71 | 70 |
try {
|
| 72 |
converter = new ConvertDocument(); |
|
| 73 |
converter.setDebug(DEBUG) |
|
| 74 |
converter.autoFile(file, odtFile, "odt") |
|
| 71 |
ConvertDocument.convert(file, odtFile) |
|
| 75 | 72 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) {
|
| 76 | 73 |
println "Docx to Odt to Tei failed: "+file |
| 77 | 74 |
odtFile.delete(); |
| ... | ... | |
| 79 | 76 |
} catch(Exception e) { println "DOCX to ODT to TEI failed: $file: $e"; }
|
| 80 | 77 |
finally {
|
| 81 | 78 |
odtFile.delete(); |
| 82 |
if (converter != null) converter.stop(); |
|
| 83 | 79 |
} |
| 84 | 80 |
|
| 85 | 81 |
} else if (FileUtils.isExtension(file, "doc")) {
|
| ... | ... | |
| 88 | 84 |
File odtFile = File.createTempFile("workflowdoc", "sfsdf.odt", outdir);
|
| 89 | 85 |
|
| 90 | 86 |
try {
|
| 91 |
converter = new ConvertDocument(); |
|
| 92 |
converter.setDebug(DEBUG) |
|
| 93 |
converter.autoFile(file, odtFile, "odt") |
|
| 87 |
ConvertDocument.convert(file, odtFile) |
|
| 94 | 88 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) {
|
| 95 | 89 |
println "DOC to ODT to TEI failed: "+file |
| 96 | 90 |
odtFile.delete(); |
| ... | ... | |
| 98 | 92 |
} catch(Exception e) { println "DOC to ODT to TEI failed: $file: $e"; }
|
| 99 | 93 |
finally {
|
| 100 | 94 |
odtFile.delete(); |
| 101 |
if (converter != null) converter.stop(); |
|
| 102 | 95 |
} |
| 103 |
|
|
| 104 | 96 |
} |
| 105 | 97 |
} |
| 106 | 98 |
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/doc/Workflow.groovy (revision 3896) | ||
|---|---|---|
| 32 | 32 |
// |
| 33 | 33 |
package org.txm.scripts.doc |
| 34 | 34 |
|
| 35 |
import org.txm.importer.ConvertDocument;
|
|
| 35 |
import org.txm.libs.jodconverter.ConvertDocument
|
|
| 36 | 36 |
|
| 37 | 37 |
// TODO: Auto-generated Javadoc |
| 38 | 38 |
/* (non-Javadoc) |
| ... | ... | |
| 55 | 55 |
|
| 56 | 56 |
println "start" |
| 57 | 57 |
for (File file : srcdir.listFiles()) {
|
| 58 |
ConvertDocument converter = new ConvertDocument(); |
|
| 59 | 58 |
try {
|
| 60 | 59 |
File teifile = new File(outdir, file.getName()+".xml"); |
| 61 | 60 |
if (file.getName().endsWith(".odt")) {
|
| ... | ... | |
| 66 | 65 |
} else if (file.getName().endsWith(".docx")) {
|
| 67 | 66 |
File odtFile = File.createTempFile("workflowodt", "sfsdf.odt", srcdir);
|
| 68 | 67 |
// convert doc to odt |
| 69 |
converter.autoFile(file, odtFile, "odt")
|
|
| 68 |
ConvertDocument.convert(file, odtFile)
|
|
| 70 | 69 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) {
|
| 71 | 70 |
println "Docx to Odt to Tei failed: "+file |
| 72 | 71 |
odtFile.delete(); |
| ... | ... | |
| 76 | 75 |
} else if (file.getName().endsWith(".doc")) {
|
| 77 | 76 |
File odtFile = File.createTempFile("workflowodt", "sfsdf.odt", srcdir);
|
| 78 | 77 |
// convert doc to odt |
| 79 |
converter.autoFile(file, odtFile, "odt")
|
|
| 78 |
ConvertDocument.convert(file, odtFile)
|
|
| 80 | 79 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) {
|
| 81 | 80 |
println "Doc to Odt to Tei failed: "+file |
| 82 | 81 |
odtFile.delete(); |
| ... | ... | |
| 86 | 85 |
} |
| 87 | 86 |
} |
| 88 | 87 |
catch(Exception e) { e.printStackTrace() }
|
| 89 |
finally { converter.stop(); }
|
|
| 88 |
finally { }
|
|
| 90 | 89 |
} |
| 91 | 90 |
|
| 92 | 91 |
println "--Done" |
| TXM/trunk/bundles/org.txm.treetagger.rcp/src/org/txm/treetagger/rcp/preferences/TreeTaggerPreferencePage.java (revision 3896) | ||
|---|---|---|
| 71 | 71 |
runOptions.setLayout(new GridLayout(3, false)); |
| 72 | 72 |
|
| 73 | 73 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_UNKNOWN, "Print the token rather than <unknown> for unknown lemma", runOptions)); |
| 74 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_CAPHEURISTIC, "Look up unknown capitalized words in the list of lower-case words", runOptions)); |
|
| 75 | 74 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_HYPHENHEURISTIC, "Turn on the heuristics fur guessing the parts of speech of unknown hyphenated words", runOptions)); |
| 76 | 75 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_PROB, "Print tag probabilities", runOptions)); |
| 77 | 76 |
this.addField(new FileFieldEditor(TreeTaggerPreferences.OPTIONS_LEX, "Read auxiliary lexicon entries from a file", runOptions)); |
| TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/corpuswizard/ImportWizard.java (revision 3896) | ||
|---|---|---|
| 105 | 105 |
project.setAnnotate(e.isRunning()); |
| 106 | 106 |
} |
| 107 | 107 |
|
| 108 |
if (project.getAnnotate()) {
|
|
| 109 |
Log.info(TXMUIMessages.TheAnnotateImportParameterHasBeenActivatedSinceTreeTaggerIsInstalled); |
|
| 110 |
} |
|
| 111 |
else {
|
|
| 112 |
Log.info(TXMUIMessages.TheAnnotateImportParameterWasNotActivatedSinceTreeTaggerIsNotInstalled); |
|
| 113 |
} |
|
| 108 |
// if (project.getAnnotate()) {
|
|
| 109 |
// Log.info(TXMUIMessages.TheAnnotateImportParameterHasBeenActivatedSinceTreeTaggerIsInstalled);
|
|
| 110 |
// }
|
|
| 111 |
// else {
|
|
| 112 |
// Log.info(TXMUIMessages.TheAnnotateImportParameterWasNotActivatedSinceTreeTaggerIsNotInstalled);
|
|
| 113 |
// }
|
|
| 114 | 114 |
|
| 115 | 115 |
File importxml = new File(path, "import.xml"); //$NON-NLS-1$ |
| 116 | 116 |
if (importxml.exists()) {
|
| ... | ... | |
| 132 | 132 |
// Log.info(TXMUIMessages.abort); |
| 133 | 133 |
// return false; |
| 134 | 134 |
// } |
| 135 |
|
|
| 136 |
|
|
| 137 | 135 |
} |
| 138 | 136 |
|
| 139 | 137 |
// if (!project.hasEditionDefinition("default")) {
|
| TXM/trunk/bundles/org.txm.utils.core/src/org/txm/utils/treetagger/TreeTagger.java (revision 3896) | ||
|---|---|---|
| 217 | 217 |
this.isnounknown = false; |
| 218 | 218 |
} |
| 219 | 219 |
|
| 220 |
// Look up unknown capitalized words in the list of lower-case words |
|
| 221 |
/** The iscapheuristics. */ |
|
| 222 |
private Boolean iscapheuristics = false; |
|
| 223 |
|
|
| 224 |
/** |
|
| 225 |
* Setcapheuristics. |
|
| 226 |
*/ |
|
| 227 |
public void setcapheuristics() {
|
|
| 228 |
this.iscapheuristics = true; |
|
| 229 |
} |
|
| 230 |
|
|
| 231 |
/** |
|
| 232 |
* Unsetcapheuristics. |
|
| 233 |
*/ |
|
| 234 |
public void unsetcapheuristics() {
|
|
| 235 |
this.iscapheuristics = false; |
|
| 236 |
} |
|
| 237 |
|
|
| 238 | 220 |
// Turn on the heuristics fur guessing the parts of speech of unknown |
| 239 | 221 |
// hyphenated words |
| 240 | 222 |
/** The ishyphenheuristics. */ |
| ... | ... | |
| 481 | 463 |
args.add("-ignore-prefix"); //$NON-NLS-1$
|
| 482 | 464 |
if (isnounknown) |
| 483 | 465 |
args.add("-no-unknown"); //$NON-NLS-1$
|
| 484 |
if (iscapheuristics) |
|
| 485 |
args.add("-cap-heuristics"); //$NON-NLS-1$
|
|
| 486 | 466 |
if (ishyphenheuristics) |
| 487 | 467 |
args.add("-hyphen-heuristics"); //$NON-NLS-1$
|
| 488 | 468 |
if (isquiet) |
| TXM/trunk/features/org.txm.core.feature/feature.xml (revision 3896) | ||
|---|---|---|
| 192 | 192 |
version="0.0.0" |
| 193 | 193 |
unpack="false"/> |
| 194 | 194 |
|
| 195 |
<plugin |
|
| 196 |
id="org.txm.libs.jodconverter" |
|
| 197 |
download-size="0" |
|
| 198 |
install-size="0" |
|
| 199 |
version="0.0.0" |
|
| 200 |
unpack="false"/> |
|
| 201 |
|
|
| 195 | 202 |
</feature> |
Formats disponibles : Unified diff