Révision 3364
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 3364) | ||
---|---|---|
1 |
// Copyright © 2020 ENS de Lyon, CNRS, University of Franche-Comté |
|
1 |
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
|
|
2 | 2 |
// @author mdecorde |
3 |
// @author sheiden |
|
3 | 4 |
|
4 | 5 |
// STANDARD DECLARATIONS |
5 | 6 |
package org.txm.macro.export |
... | ... | |
21 | 22 |
|
22 | 23 |
// PARAMETERS |
23 | 24 |
|
24 |
@Field @Option(name="exportDirectory", usage="results directory", widget="Folder", required=true, def="export")
|
|
25 |
File exportDirectory
|
|
25 |
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
|
|
26 |
File outputDirectory
|
|
26 | 27 |
|
27 |
@Field @Option(name="lineSeparatorStructureName", usage="name of the structure to use to produce the output lines", widget="String", required=false, def="p")
|
|
28 |
def lineSeparatorStructureName
|
|
28 |
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
|
|
29 |
def wordProperty
|
|
29 | 30 |
|
31 |
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false") |
|
32 |
def oneWordPerLine |
|
33 |
|
|
34 |
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true") |
|
35 |
def oneSentencePerLine |
|
36 |
|
|
37 |
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="") |
|
38 |
def sentenceStructureName |
|
39 |
|
|
30 | 40 |
if (!ParametersDialog.open(this)) return |
31 | 41 |
|
32 | 42 |
// BEGINNING |
33 | 43 |
|
34 |
if (!exportDirectory.exists()) exportDirectory.mkdirs()
|
|
44 |
if (!outputDirectory.exists()) outputDirectory.mkdirs()
|
|
35 | 45 |
|
36 | 46 |
def corpus = corpusViewSelection |
47 |
def corpusName = corpus.getName() |
|
37 | 48 |
def CQI = CQPSearchEngine.getCqiClient() |
38 | 49 |
|
39 |
def lineSeparatorStructure = corpus.getStructuralUnit(lineSeparatorStructureName) |
|
50 |
if (wordProperty == null || !(wordProperty.length() > 0)) { |
|
51 |
println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..." |
|
52 |
return 1 |
|
53 |
} |
|
40 | 54 |
|
41 |
if (lineSeparatorStructure == null) {
|
|
42 |
println "** The $lineSeparatorStructureName structure does not exist in the $corpus corpus"
|
|
43 |
return |
|
55 |
if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
|
|
56 |
println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
|
|
57 |
return 1
|
|
44 | 58 |
} |
45 | 59 |
|
46 |
def breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+lineSeparatorStructureName+">"),"test", false).getEnds()) |
|
47 |
println "End of structure positions = "+breaks_pos |
|
60 |
if (oneSentencePerLine) { |
|
48 | 61 |
|
49 |
println "Exporting $corpus text content to $exportDirectory..."
|
|
62 |
lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)
|
|
50 | 63 |
|
51 |
def wordProperty = corpus.getWordProperty() |
|
64 |
if (lineSeparatorStructure == null) { |
|
65 |
println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..." |
|
66 |
return 1 |
|
67 |
} |
|
68 |
|
|
69 |
breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds()) |
|
70 |
} |
|
71 |
|
|
72 |
println "Exporting $corpus text content to $outputDirectory..." |
|
73 |
|
|
74 |
def wordPropertyI = corpus.getProperty(wordProperty) |
|
75 |
|
|
76 |
if (wordPropertyI == null) { |
|
77 |
println "** No '$wordProperty' word property in the $corpus corpus. Aborting..." |
|
78 |
return 1 |
|
79 |
} |
|
80 |
|
|
52 | 81 |
def textidProperty = corpus.getStructuralUnit("text").getProperty("id") |
53 | 82 |
def textStartBoundaries = corpus.getTextStartLimits() |
54 | 83 |
def textEndBoundaries = corpus.getTextEndLimits() |
55 | 84 |
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries) |
56 | 85 |
String[] textids = CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos) |
86 |
|
|
57 | 87 |
if (textStartBoundaries.size() == 1) { |
58 | 88 |
println "1 text" |
59 | 89 |
} else { |
... | ... | |
64 | 94 |
int start = textStartBoundaries[i] |
65 | 95 |
int end = textEndBoundaries[i] |
66 | 96 |
|
67 |
File txtFile = new File(exportDirectory, textids[i]+".txt")
|
|
97 |
File txtFile = new File(outputDirectory, textids[i]+".txt")
|
|
68 | 98 |
print "." |
69 | 99 |
def writer = txtFile.newWriter("UTF-8") |
70 | 100 |
int[] positions = new int[end - start + 1] |
... | ... | |
72 | 102 |
for (int p : start..end) { |
73 | 103 |
positions[c++] = p |
74 | 104 |
} |
75 |
int[] idx = CQI.cpos2Id(wordProperty.getQualifiedName(), positions) |
|
76 |
def words = CQI.id2Str(wordProperty.getQualifiedName(), idx) |
|
105 |
int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
|
|
106 |
def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)
|
|
77 | 107 |
def tmp = [] |
78 | 108 |
for (int j = 0 ; j < positions.length ; j++) { |
79 | 109 |
int p = positions[j] |
80 | 110 |
tmp << words[j] |
81 |
if (breaks_pos.contains(p)) { |
|
82 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
111 |
if (oneSentencePerLine && breaks_pos.contains(p)) { |
|
112 |
if (oneWordPerLine) { |
|
113 |
tmp.each { word -> writer.println word } |
|
114 |
} else { |
|
115 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
116 |
} |
|
83 | 117 |
tmp = [] |
84 | 118 |
} |
85 | 119 |
} |
86 | 120 |
if (tmp.size() > 0) { |
87 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
121 |
if (oneWordPerLine) { |
|
122 |
tmp.each { word -> writer.println word } |
|
123 |
} else { |
|
124 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang()) |
|
125 |
} |
|
88 | 126 |
} |
89 | 127 |
writer.close() |
90 | 128 |
} |
91 | 129 |
|
92 |
println "\nDone, result saved in "+exportDirectory.getAbsolutePath() |
|
130 |
println "\nDone, result saved in "+outputDirectory.getAbsolutePath() |
Formats disponibles : Unified diff