Révision 3896
TXM/trunk/bundles/org.txm.core/src/java/org/txm/importer/ConvertDocument.java (revision 3896) | ||
---|---|---|
1 |
package org.txm.importer; |
|
2 |
import java.io.File; |
|
3 |
import java.util.Arrays; |
|
4 |
import java.util.List; |
|
5 |
|
|
6 |
import org.jodconverter.core.office.OfficeException; |
|
7 |
import org.jodconverter.core.office.OfficeManager; |
|
8 |
import org.jodconverter.local.JodConverter; |
|
9 |
import org.jodconverter.local.office.LocalOfficeManager; |
|
10 |
|
|
11 |
public class ConvertDocument { |
|
12 |
|
|
13 |
boolean DEBUG = false; |
|
14 |
List<String> supportedInput = Arrays.asList("odt", "doc", "docx", "html", "pdf"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ |
|
15 |
List<String> supportedOutput = Arrays.asList("odt", "doc", "pdf"); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ |
|
16 |
OfficeManager officeManager = null; |
|
17 |
|
|
18 |
public ConvertDocument(String officeHome) throws OfficeException { |
|
19 |
//officeManager.setConnectionProtocol(OfficeConnectionProtocol.PIPE); |
|
20 |
//officeManager.setPipeNames("office1", "office2"); |
|
21 |
//officeManager.setTaskExecutionTimeout(30000L); |
|
22 |
officeManager = LocalOfficeManager.builder().build(); |
|
23 |
|
|
24 |
try { |
|
25 |
officeManager.start(); |
|
26 |
} catch(Exception e) { |
|
27 |
officeManager.stop(); |
|
28 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
29 |
} |
|
30 |
} |
|
31 |
|
|
32 |
public ConvertDocument() throws OfficeException { |
|
33 |
|
|
34 |
officeManager = LocalOfficeManager.builder().build(); |
|
35 |
try { |
|
36 |
officeManager.start(); |
|
37 |
} catch(Exception e) { |
|
38 |
officeManager.stop(); |
|
39 |
org.txm.utils.logger.Log.printStackTrace(e); |
|
40 |
} |
|
41 |
} |
|
42 |
|
|
43 |
public void setDebug(boolean b) { |
|
44 |
DEBUG = b; |
|
45 |
} |
|
46 |
|
|
47 |
public void stop() throws OfficeException { |
|
48 |
officeManager.stop(); |
|
49 |
} |
|
50 |
|
|
51 |
public File toODT(File document, File outdir) throws Exception { |
|
52 |
return auto(document, outdir, "odt"); //$NON-NLS-1$ |
|
53 |
} |
|
54 |
|
|
55 |
public File toDOC(File document, File outdir) throws Exception { |
|
56 |
return auto(document, outdir, "doc"); //$NON-NLS-1$ |
|
57 |
} |
|
58 |
|
|
59 |
public File toPDF(File document, File outdir) throws Exception { |
|
60 |
return auto(document, outdir, "pdf"); //$NON-NLS-1$ |
|
61 |
} |
|
62 |
|
|
63 |
public File auto(File document, File outdir, String ext) throws Exception { |
|
64 |
|
|
65 |
//get filename without ext |
|
66 |
int idx = document.getName().lastIndexOf("."); //$NON-NLS-1$ |
|
67 |
if (idx == -1) return null; |
|
68 |
String name = document.getName().substring(0, idx-1); |
|
69 |
|
|
70 |
File outfile = new File(outdir, name+".odt"); //$NON-NLS-1$ |
|
71 |
try { |
|
72 |
return autoFile(document, outfile, ext); |
|
73 |
} catch(Exception e) { |
|
74 |
if (e instanceof OfficeException) { |
|
75 |
OfficeException oe = (OfficeException) e; |
|
76 |
if (oe.getMessage().contains("officeHome not set")) { |
|
77 |
throw new OfficeException("Could not find LibreOffice or OpenOffice 3.x installation. Reason="+e.getMessage()); |
|
78 |
} else if (oe.getMessage().contains("is already running")) { |
|
79 |
String pid = oe.getMessage(); |
|
80 |
if (pid.indexOf("pid") > 0) { |
|
81 |
pid = pid.substring(pid.indexOf("pid")); |
|
82 |
} |
|
83 |
throw new OfficeException("Could not run LibreOffice or OpenOffice because a 'soffice' process is already running (process id="+pid+"). Please kill the process and restart. Reason="+e.getMessage()); |
|
84 |
} else |
|
85 |
throw e; |
|
86 |
} else |
|
87 |
throw e; |
|
88 |
} |
|
89 |
} |
|
90 |
|
|
91 |
public File autoFile(File document, File outdir, String ext) throws Exception { |
|
92 |
|
|
93 |
ConvertDocument converter = new ConvertDocument(); |
|
94 |
if ("doc".equals(ext.toLowerCase())) { |
|
95 |
return converter.toDOC(document, outdir); |
|
96 |
} else if ("odt".equals(ext.toLowerCase())) { |
|
97 |
return converter.toODT(document, outdir); |
|
98 |
} if ("pdf".equals(ext.toLowerCase())) { |
|
99 |
return converter.toPDF(document, outdir); |
|
100 |
} else { |
|
101 |
return null; |
|
102 |
} |
|
103 |
} |
|
104 |
|
|
105 |
/** |
|
106 |
* @param args |
|
107 |
* @throws OfficeException |
|
108 |
*/ |
|
109 |
public static void main(String[] args) throws OfficeException { |
|
110 |
//converter.convert(infile, outfile, new ) |
|
111 |
// String[] srcexts = { "doc" }; |
|
112 |
// String[] exts = { "odt" };//, "odt", "doc", "docx", "html", "pdf"}; // html pdf |
|
113 |
// for(String srcext : srcexts) { |
|
114 |
// File infile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/orig."+srcext); |
|
115 |
// for(String ext : exts) { |
|
116 |
// System.out.println("Convert from "+srcext+" to "+ext); |
|
117 |
// OfficeManager officeManager = null; |
|
118 |
// try { |
|
119 |
// officeManager = new DefaultOfficeManagerConfiguration().buildOfficeManager(); |
|
120 |
// officeManager.start(); |
|
121 |
// |
|
122 |
// OfficeDocumentConverter converter = new OfficeDocumentConverter(officeManager); |
|
123 |
// System.out.println(converter.getFormatRegistry().getClass()); |
|
124 |
// File outfile = new File("/home/mdecorde/Bureau/matrice/témoignages/CONVERSIONS/jod/"+srcext+"."+ext); |
|
125 |
// converter.convert(infile, outfile, converter.getFormatRegistry().getFormatByExtension(ext)); |
|
126 |
// |
|
127 |
// } catch(Exception e) { |
|
128 |
// System.out.println(e.getLocalizedMessage()); |
|
129 |
// } finally { |
|
130 |
// officeManager.stop(); |
|
131 |
// } |
|
132 |
// } |
|
133 |
// } |
|
134 |
|
|
135 |
File infile = new File("/home/mdecorde/TEMP/Corpus_Riverains_TXM/EP27_txm.doc"); //$NON-NLS-1$ |
|
136 |
File outdir = new File("/home/mdecorde/TEMP"); //$NON-NLS-1$ |
|
137 |
JodConverter.convert(infile).to(new File(outdir, "test.odt")); |
|
138 |
//System.out.println("result: "+ConvertDocument.toODT(infile, outdir)); |
|
139 |
ConvertDocument convert = new ConvertDocument(); |
|
140 |
try { |
|
141 |
System.out.println("result: "+convert.toODT(infile, outdir)); //$NON-NLS-1$ |
|
142 |
} catch(Exception e) { org.txm.utils.logger.Log.printStackTrace(e);} |
|
143 |
convert.stop(); |
|
144 |
} |
|
145 |
|
|
146 |
} |
TXM/trunk/bundles/org.txm.libs.jodconverter/src/org/txm/libs/jodconverter/ConvertDocument.java (revision 3896) | ||
---|---|---|
7 | 7 |
import org.jodconverter.core.office.OfficeUtils; |
8 | 8 |
import org.jodconverter.local.JodConverter; |
9 | 9 |
import org.jodconverter.local.office.LocalOfficeManager; |
10 |
import org.jodconverter.local.office.LocalOfficeManager.Builder; |
|
10 | 11 |
|
11 | 12 |
public class ConvertDocument { |
12 | 13 |
|
13 |
public static void test2(File inputFile, File outputFile) throws OfficeException { |
|
14 |
// Create an office manager using the default configuration. |
|
15 |
// The default port is 2002. Note that when an office manager |
|
16 |
// is installed, it will be the one used by default when |
|
17 |
// a converter is created. |
|
18 |
final LocalOfficeManager officeManager = LocalOfficeManager.install(); |
|
14 |
public static void convert(File inputFile, File outputFile) throws OfficeException { |
|
15 |
convert(inputFile, outputFile, null); |
|
16 |
} |
|
17 |
|
|
18 |
public static void convert(File inputFile, File outputFile, String officePath) throws OfficeException { |
|
19 |
|
|
20 |
Builder builder = LocalOfficeManager.builder().install(); |
|
21 |
if (officePath != null && officePath.length() > 0) { |
|
22 |
builder.officeHome(officePath); |
|
23 |
} |
|
24 |
OfficeManager officeManager = builder.build(); |
|
19 | 25 |
try { |
20 |
|
|
21 |
// Start an office process and connect to the started instance (on port 2002). |
|
22 |
officeManager.start(); |
|
23 |
|
|
24 |
// Convert |
|
25 |
JodConverter |
|
26 |
.convert(inputFile) |
|
27 |
.to(outputFile) |
|
28 |
.execute(); |
|
26 |
// Start an office process and connect to the started instance (on port 2002). |
|
27 |
officeManager.start(); |
|
28 |
// Convert |
|
29 |
JodConverter |
|
30 |
.convert(inputFile) |
|
31 |
.to(outputFile) |
|
32 |
.execute(); |
|
29 | 33 |
} finally { |
30 |
// Stop the office process
|
|
31 |
OfficeUtils.stopQuietly(officeManager);
|
|
34 |
// Stop the office process
|
|
35 |
OfficeUtils.stopQuietly(officeManager);
|
|
32 | 36 |
} |
33 | 37 |
} |
34 | 38 |
|
35 |
public static void test1(File inputFile, File outputFile) throws OfficeException {
|
|
39 |
public static void setOfficeHome(String officeHome) {
|
|
36 | 40 |
|
37 |
OfficeManager officeManager = LocalOfficeManager.builder() |
|
38 |
.install() |
|
39 |
.officeHome("/opt/libreoffice7.5") |
|
40 |
.build(); |
|
41 |
try { |
|
42 |
// Start an office process and connect to the started instance (on port 2002). |
|
43 |
officeManager.start(); |
|
44 |
// Convert |
|
45 |
JodConverter |
|
46 |
.convert(inputFile) |
|
47 |
.to(outputFile) |
|
48 |
.execute(); |
|
49 |
} finally { |
|
50 |
// Stop the office process |
|
51 |
OfficeUtils.stopQuietly(officeManager); |
|
52 |
} |
|
41 |
if (officeHome == null) { |
|
42 |
System.setProperty("office.home", ""); |
|
43 |
} else { |
|
44 |
System.setProperty("office.home", officeHome); |
|
45 |
|
|
46 |
Builder builder = LocalOfficeManager.builder().install(); |
|
47 |
builder.officeHome(officeHome); |
|
48 |
} |
|
53 | 49 |
} |
54 | 50 |
|
55 | 51 |
public static void main(String[] args) throws OfficeException { |
56 | 52 |
|
57 |
File inputFile = new File("/home/mdecorde/Documents/FICHE d'inscription à une formation ENS - DECORDE.doc");
|
|
58 |
File outputFile = new File("/home/mdecorde/Documents/FICHE d'inscription à une formation ENS - DECORDE.odt");
|
|
53 |
File inputFile = new File("/home/mdecorde/xml/doc/Formulaire_SFT 2022 - Decorde.docx");
|
|
54 |
File outputFile = new File("/home/mdecorde/xml/doc/Formulaire_SFT 2022 - Decorde.odt");
|
|
59 | 55 |
|
60 |
test2(inputFile, outputFile);
|
|
56 |
convert(inputFile, outputFile);
|
|
61 | 57 |
} |
62 | 58 |
} |
TXM/trunk/bundles/org.txm.libs.jodconverter/META-INF/MANIFEST.MF (revision 3896) | ||
---|---|---|
15 | 15 |
org.jodconverter.local.office, |
16 | 16 |
org.jodconverter.local.office.utils, |
17 | 17 |
org.jodconverter.local.process, |
18 |
org.jodconverter.local.task |
|
18 |
org.jodconverter.local.task, |
|
19 |
org.txm.libs.jodconverter |
|
19 | 20 |
Require-Bundle: org.txm.libs.gson;bundle-version="2.8.6" |
20 | 21 |
Bundle-Vendor: JodConverter |
21 | 22 |
Automatic-Module-Name: org.txm.libs.jodconverter |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/text/Text2TXTMacro.groovy (revision 3896) | ||
---|---|---|
1 | 1 |
package org.txm.macro.text; |
2 | 2 |
// STANDARD DECLARATIONS |
3 | 3 |
|
4 |
import org.kohsuke.args4j.* |
|
5 | 4 |
import groovy.transform.Field |
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.doc.*; |
|
8 |
import org.txm.importer.*; |
|
5 |
import org.txm.libs.jodconverter.ConvertDocument |
|
9 | 6 |
|
10 | 7 |
// BEGINNING OF PARAMETERS |
11 | 8 |
@Field @Option(name="inputDirectory", usage="the directory containing the DOC/ODT/RTF files to convert", widget="Folder", required=true, def="") |
... | ... | |
19 | 16 |
|
20 | 17 |
boolean debug = false; |
21 | 18 |
|
22 |
ConvertDocument converter; |
|
23 | 19 |
def files = [] |
24 | 20 |
try { |
25 |
converter = new ConvertDocument(); |
|
26 | 21 |
inputDirectory.eachFileMatch(~/.+\.$extension/) { docFile -> |
27 | 22 |
String name = docFile.getName() |
28 | 23 |
name = name.substring(0, name.lastIndexOf(".")) |
29 | 24 |
def txtFile = new File(docFile.getParentFile(), name+".txt") |
30 |
converter.autoFile(docFile, txtFile, "txt")
|
|
25 |
ConvertDocument.convert(docFile, txtFile)
|
|
31 | 26 |
files << docFile |
32 | 27 |
} |
33 | 28 |
} catch(Exception e) { |
34 | 29 |
println "Error while processing directory: "+e; |
35 | 30 |
if (debug) e.printStackTrace(); |
36 | 31 |
} |
37 |
if (converter != null) converter.stop(); |
|
38 | 32 |
|
39 | 33 |
println "Processed directory: $inputDirectory" |
40 | 34 |
println "files: "+files |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/text/SetOfficeMacro.groovy (revision 3896) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS
// NOTE(review): the declared package (org.txm.macro.office) differs from the
// directory this script is listed under (org/txm/macro/text) — confirm.
package org.txm.macro.office

import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcp.swt.widget.parameters.*
import org.txm.libs.jodconverter.ConvertDocument

// Macro: asks for the office installation directory, validates it and
// registers it through ConvertDocument.setOfficeHome().

// BEGINNING OF PARAMETERS

@Field @Option(name="office_path", usage="Path to LibreOffice or OpenOffice installation directory", widget="Folder", required=false, def="libreoffice or openoffice install directory")
def office_path

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// END OF PARAMETERS

// Validate the selected directory; each failed check prints a message and ends the macro.
if (office_path == null) {
	println "No path to office directory given."
	return
}

if (!office_path.exists()) {
	println "'$office_path' directory not found."
	return
}

if (!office_path.isDirectory()) {
	println "'$office_path' exists but is not a directory."
	return
}

if (!office_path.canExecute()) {
	println "'$office_path' exists but does not have sufficient rights to be used."
	return
}

// Remember the previous value so it can be reported after the change.
def old = System.getProperty("office.home")
ConvertDocument.setOfficeHome(office_path.getAbsolutePath())
println "Office path set to '"+System.getProperty("office.home")+"'."
if (old != null) {
	println " Previous path was '${old}'."
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/prototypes/export/office/SetOfficeMacro.groovy (revision 3896) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS
// NOTE(review): the declared package (org.txm.macro.office) differs from the
// directory this script is listed under (org/txm/macro/prototypes/export/office) — confirm.
package org.txm.macro.office

import org.kohsuke.args4j.*
import groovy.transform.Field
import org.txm.rcp.swt.widget.parameters.*

// Macro: asks for the office installation directory, validates it and
// stores it in the "office.home" system property.

// BEGINNING OF PARAMETERS

@Field @Option(name="office_path", usage="Path to LibreOffice or OpenOffice installation directory", widget="Folder", required=false, def="libreoffice or openoffice install directory")
def office_path

// Open the parameters input dialog box
if (!ParametersDialog.open(this)) return;

// END OF PARAMETERS

// Validate the selected directory; each failed check prints a message and ends the macro.
if (office_path == null) {
	println "No path to office directory given."
	return
}

if (!office_path.exists()) {
	println "'$office_path' directory not found."
	return
}

if (!office_path.isDirectory()) {
	println "'$office_path' exists but is not a directory."
	return
}

if (!office_path.canExecute()) {
	println "'$office_path' exists but does not have sufficient rights to be used."
	return
}

// Remember the previous value so it can be reported after the change.
def old = System.getProperty("office.home")
System.setProperty("office.home", office_path.getAbsolutePath())
println "Office path set to '"+System.getProperty("office.home")+"'."
if (old != null) {
	println " Previous path was '${old}'."
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Transana2TRS.groovy (revision 3896) | ||
---|---|---|
58 | 58 |
import java.text.ParseException; |
59 | 59 |
import java.text.SimpleDateFormat; |
60 | 60 |
import java.util.Date; |
61 |
import org.txm.libs.jodconverter.ConvertDocument |
|
61 | 62 |
|
62 | 63 |
class Transana2TRS { |
64 |
|
|
63 | 65 |
boolean debug = false; |
64 | 66 |
boolean isDirectory = false; |
65 | 67 |
File outDir; |
66 | 68 |
File dtd; |
67 |
ConvertDocument converter; |
|
68 | 69 |
def monitor |
69 | 70 |
|
70 | 71 |
Transana2TRS(File outDir, File dtd, boolean debug, def monitor) { |
... | ... | |
92 | 93 |
boolean ret = true |
93 | 94 |
try { |
94 | 95 |
println "* Processing $dir directory" |
95 |
converter = new ConvertDocument(); |
|
96 | 96 |
def files = dir.listFiles() |
97 | 97 |
def okfiles = []; |
98 | 98 |
if (files != null) |
... | ... | |
126 | 126 |
println "Error while processing directory: "+e; |
127 | 127 |
if (debug) e.printStackTrace(); |
128 | 128 |
} |
129 |
if (converter != null) converter.stop(); |
|
130 | 129 |
return ret; |
131 | 130 |
} |
132 | 131 |
|
... | ... | |
202 | 201 |
boolean DOCtoHTML(File docFile, File htmlFile) { |
203 | 202 |
println "*** ODT -> HTML" |
204 | 203 |
try { |
205 |
if (!isDirectory) converter = new ConvertDocument(); |
|
206 |
converter.setDebug(debug) |
|
207 |
converter.autoFile(docFile, htmlFile, "html") |
|
204 |
ConvertDocument.convert(docFile, htmlFile) |
|
208 | 205 |
} catch(Exception e) { |
209 | 206 |
println "Error while converting $docFile : $e" |
210 | 207 |
if (debug) e.printStackTrace() |
211 | 208 |
} finally { |
212 |
if (!isDirectory && converter != null) converter.stop() |
|
213 | 209 |
} |
214 | 210 |
return htmlFile.exists() && htmlFile.length() > 0 |
215 | 211 |
} |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/TextTranscription2TRS.groovy (revision 3896) | ||
---|---|---|
36 | 36 |
|
37 | 37 |
import org.txm.scripts.importer.*; |
38 | 38 |
import org.xml.sax.Attributes; |
39 |
import org.txm.importer.ConvertDocument; |
|
40 | 39 |
import org.txm.importer.scripts.filters.*; |
41 | 40 |
|
42 | 41 |
import java.io.File; |
... | ... | |
56 | 55 |
import java.text.ParseException; |
57 | 56 |
import java.text.SimpleDateFormat; |
58 | 57 |
import java.util.Date; |
58 |
import org.txm.libs.jodconverter.ConvertDocument |
|
59 |
|
|
59 | 60 |
/** |
60 | 61 |
* BUGS: |
61 | 62 |
* - n'a pas repéré les timings autour des commentaires |
... | ... | |
68 | 69 |
boolean debug = false; |
69 | 70 |
boolean isDirectory = false; |
70 | 71 |
File outDir; |
71 |
ConvertDocument converter; |
|
72 | 72 |
def monitor |
73 | 73 |
|
74 | 74 |
TextTranscription2TRS(File outDir, boolean debug, def monitor) { |
... | ... | |
95 | 95 |
boolean ret = true |
96 | 96 |
try { |
97 | 97 |
println "* Processing $dir directory" |
98 |
converter = new ConvertDocument(); |
|
99 | 98 |
def files = dir.listFiles() |
100 | 99 |
def okfiles = []; |
101 | 100 |
if (files != null) |
... | ... | |
129 | 128 |
println "Error while processing directory: "+e; |
130 | 129 |
if (debug) e.printStackTrace(); |
131 | 130 |
} |
132 |
if (converter != null) converter.stop(); |
|
133 | 131 |
return ret; |
134 | 132 |
} |
135 | 133 |
|
... | ... | |
198 | 196 |
boolean DOCtoTXT(File docFile, File txtFile) { |
199 | 197 |
println "*** ODT -> TXT" |
200 | 198 |
try { |
201 |
if (!isDirectory) converter = new ConvertDocument(); |
|
202 |
converter.setDebug(debug) |
|
203 |
converter.autoFile(docFile, txtFile, "txt") |
|
199 |
ConvertDocument.convert(docFile, txtFile) |
|
204 | 200 |
} catch(Exception e) { |
205 | 201 |
println "Error while converting $docFile : $e" |
206 | 202 |
if (debug) e.printStackTrace() |
207 |
} finally { |
|
208 |
if (!isDirectory && converter != null) converter.stop() |
|
209 | 203 |
} |
210 | 204 |
return txtFile.exists() && txtFile.length() > 0 |
211 | 205 |
} |
212 | 206 |
|
213 |
// boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) { |
|
214 |
// println "*** HTML -> HTML for tidy" |
|
215 |
// try { |
|
216 |
// println "replace TABS with 4 spaces" |
|
217 |
// String text2 = htmlFile.getText("UTF-8") |
|
218 |
// text2 = text2.replaceAll(" ", " ") |
|
219 |
// text2 = text2.replaceAll("’", "'") |
|
220 |
// text2 = text2.replaceAll("’", "'") |
|
221 |
// text2 = text2.replaceAll("\t", " ") |
|
222 |
// text2 = text2.replaceAll("\n", ' ') |
|
223 |
// text2 = text2.replaceAll("\r\n", ' ') |
|
224 |
// htmlFile2.withWriter("UTF-8") { writer -> |
|
225 |
// writer.write(text2); |
|
226 |
// } |
|
227 |
// } catch(Exception e) { |
|
228 |
// println "Error while preparing HTML of $htmlFile : $e" |
|
229 |
// if (debug) e.printStackTrace() |
|
230 |
// } |
|
231 |
// return htmlFile2.exists() && htmlFile2.length() > 0 |
|
232 |
// } |
|
233 |
// |
|
234 |
// boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) { |
|
235 |
// println "*** HTML for tidy -> XHTML" |
|
236 |
// try { |
|
237 |
// Tidy tidy = new Tidy(); // obtain a new Tidy instance |
|
238 |
// tidy.setXHTML(true); // set desired config options using tidy setters |
|
239 |
// tidy.setInputEncoding("UTF-8") |
|
240 |
// tidy.setOutputEncoding("UTF-8") |
|
241 |
// tidy.setShowErrors(100) |
|
242 |
// tidy.setShowWarnings(debug) |
|
243 |
// tidy.setTabsize(10) |
|
244 |
// tidy.setWraplen(9999) |
|
245 |
// tidy.setForceOutput(true) // Tidy won't stop if error are found |
|
246 |
// xhtmlFile.withWriter("UTF-8") { out -> |
|
247 |
// def input = new InputStreamReader(htmlFile2.toURI().toURL().newInputStream(), "UTF-8") |
|
248 |
// tidy.parse(input, out); // run tidy, providing an input and output stream |
|
249 |
// } |
|
250 |
// if (xhtmlFile.exists()) { |
|
251 |
// // JTidy produced a "0x0" char. removing them |
|
252 |
// // fix separated < and / ??? |
|
253 |
// def c = Character.toChars(0)[0] |
|
254 |
// String txttmp = xhtmlFile.getText("UTF-8"); |
|
255 |
// xhtmlFile.withWriter("UTF-8") { out -> |
|
256 |
// out.write(txttmp.replace("<\n/", "</").replace("<\r\n/", "</")) |
|
257 |
// } |
|
258 |
// } |
|
259 |
// } catch(Exception e) { |
|
260 |
// println "Error while applying JTidy: "+e |
|
261 |
// if (debug) e.printStackTrace() |
|
262 |
// } |
|
263 |
// return xhtmlFile.exists() && xhtmlFile.length() > 0 |
|
264 |
// } |
|
265 |
|
|
266 | 207 |
boolean TXTtoTRS(File txtFile, File trsFile) { |
267 | 208 |
println "*** TXT -> TRS" |
268 | 209 |
try { |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/clix/treetagger-wrapper-definition.xml (revision 3896) | ||
---|---|---|
1 |
<?xml version="1.0"> |
|
1 |
<?xml version="1.0"?>
|
|
2 | 2 |
<application name="TreeTagger" version="0.0.0" desc="Tag files"> |
3 | 3 |
<progs> |
4 | 4 |
<prog exec="tree-tagger" version="0.4.6" desc="Tag"> |
... | ... | |
10 | 10 |
<arg state="optional" type="none" name="prob" desc="Print tag probabilities"/> |
11 | 11 |
<arg state="optional" type="none" name="ignore-prefix" desc=" Ignore prefix when guessing pos for unknown words"/> |
12 | 12 |
<arg state="optional" type="none" name="no-unknown" desc="Print the token rather than [unknown] for unknown lemma"/> |
13 |
<arg state="optional" type="none" name="cap-heuristics" desc="Look up unknown capitalized words in the list of lower-case words"/> |
|
14 | 13 |
<arg state="optional" type="none" name="hyphen-heuristics" desc="Turn on the heuristics fur guessing the parts of speech of unknown hyphenated words"/> |
15 | 14 |
<arg state="optional" type="none" name="quiet" desc="quiet mode"/> |
16 | 15 |
<arg state="optional" type="none" name="pt-with-prob" desc="pretagging with probabilities"/> |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/doc/docLoader.groovy (revision 3896) | ||
---|---|---|
133 | 133 |
if (infile.isDirectory()) { |
134 | 134 |
File unzipDir = new File(docfiles, infile.getName()) |
135 | 135 |
infile.renameTo(unzipDir) |
136 |
|
|
136 |
|
|
137 | 137 |
//println "zipdir "+unzipDir |
138 | 138 |
StylesToCSS converter = new StylesToCSS(unzipDir); |
139 | 139 |
if (!converter.process(new File(unzipDir, "style.css"))) { |
140 | 140 |
println "WARNING: Failed to build css file of $unzipDir" |
141 | 141 |
} |
142 | 142 |
// and get the soft page breaks and styles parents |
143 |
def parentStyles = converter.parentStyles;
|
|
143 |
def parentStyles = converter.parentStyles |
|
144 | 144 |
def beforebreaks = converter.beforebreaks |
145 | 145 |
def afterbreaks = converter.afterbreaks |
146 |
|
|
146 |
|
|
147 | 147 |
//println "BEFORES: "+beforebreaks |
148 | 148 |
//println "AFTERS: "+afterbreaks |
149 | 149 |
//println "PARENTS: "+parentStyles |
150 |
|
|
150 |
|
|
151 | 151 |
// se servir de ça pour insérer <pb/> et remplacer styles automatiques |
152 | 152 |
File xmlFile = new File(txmDir, unzipDir.getName().substring(6)) |
153 | 153 |
//println "PATCH : $xmlFile" |
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/doc/DocumentToTei.groovy (revision 3896) | ||
---|---|---|
30 | 30 |
import java.io.File; |
31 | 31 |
|
32 | 32 |
import org.txm.importer.ApplyXsl2; |
33 |
import org.txm.importer.ConvertDocument; |
|
34 | 33 |
import org.txm.utils.zip.Zip |
35 | 34 |
import org.txm.utils.FileUtils |
35 |
import org.txm.libs.jodconverter.ConvertDocument |
|
36 | 36 |
|
37 | 37 |
/** |
38 | 38 |
* The Class DocumentToTei. |
... | ... | |
49 | 49 |
if (!xsldir.exists()) { println "XslDir does not exists: "+xsldir; return false;} |
50 | 50 |
if (!xslOdtTei.exists()) { println "xslOdtTei file does not exists: "+xslOdtTei; return false;} |
51 | 51 |
|
52 |
ConvertDocument converter; |
|
53 | 52 |
for (File file : files) { |
54 | 53 |
print "." |
55 | 54 |
String name = file.getName(); |
... | ... | |
69 | 68 |
File odtFile = File.createTempFile("workflowdocx", "sfsdf.odt", outdir); |
70 | 69 |
|
71 | 70 |
try { |
72 |
converter = new ConvertDocument(); |
|
73 |
converter.setDebug(DEBUG) |
|
74 |
converter.autoFile(file, odtFile, "odt") |
|
71 |
ConvertDocument.convert(file, odtFile) |
|
75 | 72 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) { |
76 | 73 |
println "Docx to Odt to Tei failed: "+file |
77 | 74 |
odtFile.delete(); |
... | ... | |
79 | 76 |
} catch(Exception e) { println "DOCX to ODT to TEI failed: $file: $e"; } |
80 | 77 |
finally { |
81 | 78 |
odtFile.delete(); |
82 |
if (converter != null) converter.stop(); |
|
83 | 79 |
} |
84 | 80 |
|
85 | 81 |
} else if (FileUtils.isExtension(file, "doc")) { |
... | ... | |
88 | 84 |
File odtFile = File.createTempFile("workflowdoc", "sfsdf.odt", outdir); |
89 | 85 |
|
90 | 86 |
try { |
91 |
converter = new ConvertDocument(); |
|
92 |
converter.setDebug(DEBUG) |
|
93 |
converter.autoFile(file, odtFile, "odt") |
|
87 |
ConvertDocument.convert(file, odtFile) |
|
94 | 88 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) { |
95 | 89 |
println "DOC to ODT to TEI failed: "+file |
96 | 90 |
odtFile.delete(); |
... | ... | |
98 | 92 |
} catch(Exception e) { println "DOC to ODT to TEI failed: $file: $e"; } |
99 | 93 |
finally { |
100 | 94 |
odtFile.delete(); |
101 |
if (converter != null) converter.stop(); |
|
102 | 95 |
} |
103 |
|
|
104 | 96 |
} |
105 | 97 |
} |
106 | 98 |
|
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/scripts/doc/Workflow.groovy (revision 3896) | ||
---|---|---|
32 | 32 |
// |
33 | 33 |
package org.txm.scripts.doc |
34 | 34 |
|
35 |
import org.txm.importer.ConvertDocument;
|
|
35 |
import org.txm.libs.jodconverter.ConvertDocument
|
|
36 | 36 |
|
37 | 37 |
// TODO: Auto-generated Javadoc |
38 | 38 |
/* (non-Javadoc) |
... | ... | |
55 | 55 |
|
56 | 56 |
println "start" |
57 | 57 |
for (File file : srcdir.listFiles()) { |
58 |
ConvertDocument converter = new ConvertDocument(); |
|
59 | 58 |
try { |
60 | 59 |
File teifile = new File(outdir, file.getName()+".xml"); |
61 | 60 |
if (file.getName().endsWith(".odt")) { |
... | ... | |
66 | 65 |
} else if (file.getName().endsWith(".docx")) { |
67 | 66 |
File odtFile = File.createTempFile("workflowodt", "sfsdf.odt", srcdir); |
68 | 67 |
// convert doc to odt |
69 |
converter.autoFile(file, odtFile, "odt")
|
|
68 |
ConvertDocument.convert(file, odtFile)
|
|
70 | 69 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) { |
71 | 70 |
println "Docx to Odt to Tei failed: "+file |
72 | 71 |
odtFile.delete(); |
... | ... | |
76 | 75 |
} else if (file.getName().endsWith(".doc")) { |
77 | 76 |
File odtFile = File.createTempFile("workflowodt", "sfsdf.odt", srcdir); |
78 | 77 |
// convert doc to odt |
79 |
converter.autoFile(file, odtFile, "odt")
|
|
78 |
ConvertDocument.convert(file, odtFile)
|
|
80 | 79 |
if (!new DocumentToTei().run(odtFile, xslOdtTei, teifile)) { |
81 | 80 |
println "Doc to Odt to Tei failed: "+file |
82 | 81 |
odtFile.delete(); |
... | ... | |
86 | 85 |
} |
87 | 86 |
} |
88 | 87 |
catch(Exception e) { e.printStackTrace() } |
89 |
finally { converter.stop(); }
|
|
88 |
finally { } |
|
90 | 89 |
} |
91 | 90 |
|
92 | 91 |
println "--Done" |
TXM/trunk/bundles/org.txm.treetagger.rcp/src/org/txm/treetagger/rcp/preferences/TreeTaggerPreferencePage.java (revision 3896) | ||
---|---|---|
71 | 71 |
runOptions.setLayout(new GridLayout(3, false)); |
72 | 72 |
|
73 | 73 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_UNKNOWN, "Print the token rather than <unknown> for unknown lemma", runOptions)); |
74 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_CAPHEURISTIC, "Look up unknown capitalized words in the list of lower-case words", runOptions)); |
|
75 | 74 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_HYPHENHEURISTIC, "Turn on the heuristics fur guessing the parts of speech of unknown hyphenated words", runOptions)); |
76 | 75 |
this.addField(new BooleanFieldEditor(TreeTaggerPreferences.OPTIONS_PROB, "Print tag probabilities", runOptions)); |
77 | 76 |
this.addField(new FileFieldEditor(TreeTaggerPreferences.OPTIONS_LEX, "Read auxiliary lexicon entries from a file", runOptions)); |
TXM/trunk/bundles/org.txm.rcp/src/main/java/org/txm/rcp/corpuswizard/ImportWizard.java (revision 3896) | ||
---|---|---|
105 | 105 |
project.setAnnotate(e.isRunning()); |
106 | 106 |
} |
107 | 107 |
|
108 |
if (project.getAnnotate()) { |
|
109 |
Log.info(TXMUIMessages.TheAnnotateImportParameterHasBeenActivatedSinceTreeTaggerIsInstalled); |
|
110 |
} |
|
111 |
else { |
|
112 |
Log.info(TXMUIMessages.TheAnnotateImportParameterWasNotActivatedSinceTreeTaggerIsNotInstalled); |
|
113 |
} |
|
108 |
// if (project.getAnnotate()) {
|
|
109 |
// Log.info(TXMUIMessages.TheAnnotateImportParameterHasBeenActivatedSinceTreeTaggerIsInstalled);
|
|
110 |
// }
|
|
111 |
// else {
|
|
112 |
// Log.info(TXMUIMessages.TheAnnotateImportParameterWasNotActivatedSinceTreeTaggerIsNotInstalled);
|
|
113 |
// }
|
|
114 | 114 |
|
115 | 115 |
File importxml = new File(path, "import.xml"); //$NON-NLS-1$ |
116 | 116 |
if (importxml.exists()) { |
... | ... | |
132 | 132 |
// Log.info(TXMUIMessages.abort); |
133 | 133 |
// return false; |
134 | 134 |
// } |
135 |
|
|
136 |
|
|
137 | 135 |
} |
138 | 136 |
|
139 | 137 |
// if (!project.hasEditionDefinition("default")) { |
TXM/trunk/bundles/org.txm.utils.core/src/org/txm/utils/treetagger/TreeTagger.java (revision 3896) | ||
---|---|---|
217 | 217 |
this.isnounknown = false; |
218 | 218 |
} |
219 | 219 |
|
220 |
// Look up unknown capitalized words in the list of lower-case words |
|
221 |
/** The iscapheuristics. */ |
|
222 |
private Boolean iscapheuristics = false; |
|
223 |
|
|
224 |
/** |
|
225 |
* Setcapheuristics. |
|
226 |
*/ |
|
227 |
public void setcapheuristics() { |
|
228 |
this.iscapheuristics = true; |
|
229 |
} |
|
230 |
|
|
231 |
/** |
|
232 |
* Unsetcapheuristics. |
|
233 |
*/ |
|
234 |
public void unsetcapheuristics() { |
|
235 |
this.iscapheuristics = false; |
|
236 |
} |
|
237 |
|
|
238 | 220 |
// Turn on the heuristics fur guessing the parts of speech of unknown |
239 | 221 |
// hyphenated words |
240 | 222 |
/** The ishyphenheuristics. */ |
... | ... | |
481 | 463 |
args.add("-ignore-prefix"); //$NON-NLS-1$ |
482 | 464 |
if (isnounknown) |
483 | 465 |
args.add("-no-unknown"); //$NON-NLS-1$ |
484 |
if (iscapheuristics) |
|
485 |
args.add("-cap-heuristics"); //$NON-NLS-1$ |
|
486 | 466 |
if (ishyphenheuristics) |
487 | 467 |
args.add("-hyphen-heuristics"); //$NON-NLS-1$ |
488 | 468 |
if (isquiet) |
TXM/trunk/features/org.txm.core.feature/feature.xml (revision 3896) | ||
---|---|---|
192 | 192 |
version="0.0.0" |
193 | 193 |
unpack="false"/> |
194 | 194 |
|
195 |
<plugin |
|
196 |
id="org.txm.libs.jodconverter" |
|
197 |
download-size="0" |
|
198 |
install-size="0" |
|
199 |
version="0.0.0" |
|
200 |
unpack="false"/> |
|
201 |
|
|
195 | 202 |
</feature> |
Formats disponibles : Unified diff