Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / export / ExportTextContentMacro.groovy @ 2769

History | View | Annotate | Download (2.6 kB)

1
// STANDARD DECLARATIONS
2
package org.txm.macro.export
3

    
4
import org.txm.searchengine.cqp.CQPSearchEngine
5
import org.txm.searchengine.cqp.corpus.*
6
import org.txm.searchengine.cqp.corpus.query.CQLQuery
7
import org.txm.Toolbox
8
import org.txm.utils.i18n.LangFormater;
9
import org.apache.commons.lang.StringUtils;
10
import org.kohsuke.args4j.*
11
import groovy.transform.Field
12
import org.txm.rcp.swt.widget.parameters.*
13

    
14
// Echo the value of corpusViewSelection (supplied from outside this script),
// then stop right away unless it is a CQPCorpus.
println("corpora selection: " + corpusViewSelection)

boolean corpusIsSelected = corpusViewSelection instanceof CQPCorpus
if (!corpusIsSelected) {
	println("Please select a corpus")
	return
}
20

    
21
// PARAMETERS
22

    
23
// Directory that receives the exported .txt files (mandatory, defaults to "export").
@Field
@Option(name="exportDirectory", usage="Result directory ",
		widget="Folder", required=true, def="export")
File exportDirectory

// Name of the structure whose ends are turned into line breaks in the export
// (optional, defaults to "p").
@Field
@Option(name="lineSeparatorStructureName", usage="line separator structure",
		widget="String", required=false, def="p")
def lineSeparatorStructureName

// Show the parameters dialog; give up when ParametersDialog.open() reports a falsy result.
if (!ParametersDialog.open(this)) {
	return
}
30

    
31
// BEGINNING
32

    
33
// Create the output folder when it is missing. mkdirs() reports failure only
// through its return value, and the path may also point to a regular file,
// so verify that a usable directory is really there before going further.
if (!exportDirectory.exists()) exportDirectory.mkdirs()
if (!exportDirectory.isDirectory()) {
	println "Cannot use $exportDirectory as export directory"
	return
}

CQPCorpus corpus = corpusViewSelection
// Assigned without a declaration on purpose: the name then lives in the
// script binding instead of being a local variable.
CQI = CQPSearchEngine.getCqiClient();

// The line separator structure must exist in the selected corpus.
def lineSeparatorStructure = corpus.getStructuralUnit(lineSeparatorStructureName)

if (lineSeparatorStructure == null) {
	println "The $lineSeparatorStructureName structure does not exists in the $corpus corpus"
	return;
}

// End positions of the matches of the CQL query "[]</name>", i.e. one token
// directly followed by the closing tag of the line separator structure.
def breaks_pos = Arrays.asList(corpus.query(new CQLQuery("[]</"+lineSeparatorStructureName+">"),"test", false).getEnds())
println breaks_pos

println "Exporting $corpus text content to $exportDirectory"

def wordProperty = corpus.getWordProperty()
def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
def textStartBoundaries = corpus.getTextStartLimits();
def textEndBoundaries = corpus.getTextEndLimits();
// Presumably: text start positions -> "text" structure numbers -> "id" values;
// the resulting strings are used as output file names.
int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
String[] textids =  CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)
println ""+textStartBoundaries.size()+" texts"
57

    
58
// Hash-based copy of the break positions: membership is tested once per
// token below, which would be a linear scan each time on the original list.
def breakPositions = new HashSet(breaks_pos)
// Loop invariant: the qualified name of the word property never changes.
def wordPropertyName = wordProperty.getQualifiedName()

// Write one "<text id>.txt" file per text of the corpus.
for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
	int start = textStartBoundaries[i]
	int end = textEndBoundaries[i]

	File txtFile = new File(exportDirectory, textids[i]+".txt")
	print ".."

	// Every corpus position of the text, both bounds included.
	int[] positions = new int[end - start + 1]
	for (int c = 0 ; c < positions.length ; c++) {
		positions[c] = start + c
	}

	// Positions -> word ids -> word strings.
	int[] idx = CQI.cpos2Id(wordPropertyName, positions)
	def words = CQI.id2Str(wordPropertyName, idx)

	// Append a line break to each word located at a break position.
	for (int j = 0 ; j < positions.length ; j++) {
		if (breakPositions.contains(positions[j])) words[j] = words[j] + "\n"
	}

	// withWriter closes the file even when formatting or writing throws,
	// so a failure on one text no longer leaks an open writer.
	txtFile.withWriter("UTF-8") { writer ->
		writer.println LangFormater.format(StringUtils.join(words, " "),
				corpus.getLang())
	}
}

println "\nDone, result saved in "+exportDirectory.getAbsolutePath()