Révision 3364
| TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 3364) | ||
|---|---|---|
| 1 |
// Copyright © 2020 ENS de Lyon, CNRS, University of Franche-Comté |
|
| 1 |
// Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
|
|
| 2 | 2 |
// @author mdecorde |
| 3 |
// @author sheiden |
|
| 3 | 4 |
|
| 4 | 5 |
// STANDARD DECLARATIONS |
| 5 | 6 |
package org.txm.macro.export |
| ... | ... | |
| 21 | 22 |
|
| 22 | 23 |
// PARAMETERS |
| 23 | 24 |
|
| 24 |
@Field @Option(name="exportDirectory", usage="results directory", widget="Folder", required=true, def="export")
|
|
| 25 |
File exportDirectory
|
|
| 25 |
@Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
|
|
| 26 |
File outputDirectory
|
|
| 26 | 27 |
|
| 27 |
@Field @Option(name="lineSeparatorStructureName", usage="name of the structure to use to produce the output lines", widget="String", required=false, def="p")
|
|
| 28 |
def lineSeparatorStructureName
|
|
| 28 |
@Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
|
|
| 29 |
def wordProperty
|
|
| 29 | 30 |
|
| 31 |
@Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false") |
|
| 32 |
def oneWordPerLine |
|
| 33 |
|
|
| 34 |
@Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true") |
|
| 35 |
def oneSentencePerLine |
|
| 36 |
|
|
| 37 |
@Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="") |
|
| 38 |
def sentenceStructureName |
|
| 39 |
|
|
| 30 | 40 |
if (!ParametersDialog.open(this)) return |
| 31 | 41 |
|
| 32 | 42 |
// BEGINNING |
| 33 | 43 |
|
| 34 |
if (!exportDirectory.exists()) exportDirectory.mkdirs()
|
|
| 44 |
if (!outputDirectory.exists()) outputDirectory.mkdirs()
|
|
| 35 | 45 |
|
| 36 | 46 |
def corpus = corpusViewSelection |
| 47 |
def corpusName = corpus.getName() |
|
| 37 | 48 |
def CQI = CQPSearchEngine.getCqiClient() |
| 38 | 49 |
|
| 39 |
def lineSeparatorStructure = corpus.getStructuralUnit(lineSeparatorStructureName) |
|
| 50 |
if (wordProperty == null || !(wordProperty.length() > 0)) {
|
|
| 51 |
println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..." |
|
| 52 |
return 1 |
|
| 53 |
} |
|
| 40 | 54 |
|
| 41 |
if (lineSeparatorStructure == null) {
|
|
| 42 |
println "** The $lineSeparatorStructureName structure does not exist in the $corpus corpus"
|
|
| 43 |
return |
|
| 55 |
if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
|
|
| 56 |
println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
|
|
| 57 |
return 1
|
|
| 44 | 58 |
} |
| 45 | 59 |
|
| 46 |
def breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+lineSeparatorStructureName+">"),"test", false).getEnds())
|
|
| 47 |
println "End of structure positions = "+breaks_pos |
|
| 60 |
if (oneSentencePerLine) {
|
|
| 48 | 61 |
|
| 49 |
println "Exporting $corpus text content to $exportDirectory..."
|
|
| 62 |
lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)
|
|
| 50 | 63 |
|
| 51 |
def wordProperty = corpus.getWordProperty() |
|
| 64 |
if (lineSeparatorStructure == null) {
|
|
| 65 |
println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..." |
|
| 66 |
return 1 |
|
| 67 |
} |
|
| 68 |
|
|
| 69 |
breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds())
|
|
| 70 |
} |
|
| 71 |
|
|
| 72 |
println "Exporting $corpus text content to $outputDirectory..." |
|
| 73 |
|
|
| 74 |
def wordPropertyI = corpus.getProperty(wordProperty) |
|
| 75 |
|
|
| 76 |
if (wordPropertyI == null) {
|
|
| 77 |
println "** No '$wordProperty' word property in the $corpus corpus. Aborting..." |
|
| 78 |
return 1 |
|
| 79 |
} |
|
| 80 |
|
|
| 52 | 81 |
def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
|
| 53 | 82 |
def textStartBoundaries = corpus.getTextStartLimits() |
| 54 | 83 |
def textEndBoundaries = corpus.getTextEndLimits() |
| 55 | 84 |
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries) |
| 56 | 85 |
String[] textids = CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos) |
| 86 |
|
|
| 57 | 87 |
if (textStartBoundaries.size() == 1) {
|
| 58 | 88 |
println "1 text" |
| 59 | 89 |
} else {
|
| ... | ... | |
| 64 | 94 |
int start = textStartBoundaries[i] |
| 65 | 95 |
int end = textEndBoundaries[i] |
| 66 | 96 |
|
| 67 |
File txtFile = new File(exportDirectory, textids[i]+".txt")
|
|
| 97 |
File txtFile = new File(outputDirectory, textids[i]+".txt")
|
|
| 68 | 98 |
print "." |
| 69 | 99 |
def writer = txtFile.newWriter("UTF-8")
|
| 70 | 100 |
int[] positions = new int[end - start + 1] |
| ... | ... | |
| 72 | 102 |
for (int p : start..end) {
|
| 73 | 103 |
positions[c++] = p |
| 74 | 104 |
} |
| 75 |
int[] idx = CQI.cpos2Id(wordProperty.getQualifiedName(), positions) |
|
| 76 |
def words = CQI.id2Str(wordProperty.getQualifiedName(), idx) |
|
| 105 |
int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
|
|
| 106 |
def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)
|
|
| 77 | 107 |
def tmp = [] |
| 78 | 108 |
for (int j = 0 ; j < positions.length ; j++) {
|
| 79 | 109 |
int p = positions[j] |
| 80 | 110 |
tmp << words[j] |
| 81 |
if (breaks_pos.contains(p)) {
|
|
| 82 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
|
|
| 111 |
if (oneSentencePerLine && breaks_pos.contains(p)) {
|
|
| 112 |
if (oneWordPerLine) {
|
|
| 113 |
tmp.each { word -> writer.println word }
|
|
| 114 |
} else {
|
|
| 115 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
|
|
| 116 |
} |
|
| 83 | 117 |
tmp = [] |
| 84 | 118 |
} |
| 85 | 119 |
} |
| 86 | 120 |
if (tmp.size() > 0) {
|
| 87 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
|
|
| 121 |
if (oneWordPerLine) {
|
|
| 122 |
tmp.each { word -> writer.println word }
|
|
| 123 |
} else {
|
|
| 124 |
writer.println LangFormater.format(tmp.join(" "), corpus.getLang())
|
|
| 125 |
} |
|
| 88 | 126 |
} |
| 89 | 127 |
writer.close() |
| 90 | 128 |
} |
| 91 | 129 |
|
| 92 |
println "\nDone, result saved in "+exportDirectory.getAbsolutePath() |
|
| 130 |
println "\nDone, result saved in "+outputDirectory.getAbsolutePath() |
|
Formats disponibles : Unified diff