Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / factiva / FactivaMail2Alceste.groovy @ 187

History | View | Annotate | Download (4.4 kB)

1
package org.txm.importer.factiva
2
/**
3
 * 
4
 * This is a Groovy port of the converter made by Pierre Ratinaud
5
 * #Author: Pierre Ratinaud
6
 * #Copyright (c) 2012 Pierre Ratinaud
7
 * #License: GNU/GPL
8
 * 
9
 * @author mdecorde
10
 *
11
 */
12
/**
 * Converts Factiva mail exports (plain-text files) into a single
 * Alceste-formatted file.
 *
 * Each article becomes one "uci": a two-element list whose first element is
 * the header line (starting with "****" followed by " *KEY_value" attributes)
 * and whose second element is the article body.
 */
class FactivaMail2Alceste {

    /** Line prefix that marks the beginning of a new article in the export. */
    private static final String ARTICLE_SEPARATOR =
            '---------------------------------------------------------------'

    /** Directory whose files are converted. */
    def txtdir;
    /** File that receives the converted output. */
    def fileout;
    /** Character encoding used to read the input files. */
    def encodage_in = 'UTF-8'
    /** Character encoding used to write the output file. */
    def encodage_out = 'UTF-8'

    /** Field codes whose value (when tab-separated on the same line) is copied into the header. */
    def metadata = ["BY":"Author",
        "ART":"CaptionsDescriptionGraphics",
        "CLM":"Column",
        "CT":"Contact",
        "CY":"Copyright",
        "CX":"Correction",
        "CR":"Credit",
        "ED":"Edition",
        "HD":"Headline",
        "LA":"Language",
        "PG":"Page",
        "PD":"PublicationDate",
        "ET":"PublicationTime",
        "RBBCM":"ReutersCodes",
        "SE":"Section",
        "SC":"SourceCode",
        "GC":"SourceGroupCode",
        "NGC":"SourceGroupName",
        "SN":"SourceName",
        "TD":"TextFollowingLeadParagraphs",
        "VOL":"Volume",
        "WC":"WordCount"]

    /** Field codes that switch body-text collection on. */
    def starts = ["LP":"LeadParagraph"]

    /** Field codes that switch body-text collection off; their value may sit on the following line. */
    def ends = ["RF":"Reference",
        "AN":"AccessionNumber",
        "PUB":"PublisherName",
        "RE":"RegionCodeDescriptor",
        "IPC":"InformationProviderCodes",
        "IN":"IndustryCodeDescriptor",
        "CO":"CompanyCodeName",
        "IPD":"InformationProviderDescriptors",
        "NS":"SubjectCodeDescriptor",
        "DJIC":"DowJonesCodes",
        "DJID":"DowJonesDescriptors"]

    /** Articles parsed by the most recent parsetxt() call: list of [header, body] pairs. */
    def ucis = []
    /** True while body lines must be appended to the current article. */
    def writeTextOK = false
    /** True when the next non-empty line holds the value of the "ends" key stored in endsKey. */
    def nextLineIsAnEndMetadataValue = false
    /** Last "ends" key encountered. */
    def endsKey

    /**
     * @param txtdir directory containing the text files to convert
     * @param fileout file the converted result is written to
     * @param encodage_in character encoding of the input files
     * @param encodage_out character encoding of the output file
     */
    FactivaMail2Alceste(File txtdir, File fileout, String encodage_in, String encodage_out) {
        this.txtdir = txtdir;
        this.fileout = fileout;
        this.encodage_in = encodage_in;
        this.encodage_out = encodage_out;
    }

    /**
     * Normalises a metadata value for use inside a header attribute.
     * Only space characters are removed; the stricter clean-up below is kept
     * commented out, as in the original port.
     *
     * @param str raw metadata value
     * @return the value without spaces
     */
    def toAttribute(String str) {
        return str.replaceAll(' ','')
//        .replaceAll('\'','').replaceAll(":", "").replaceAll("\\|", "").replaceAll("_","")
//                 .replaceAll('´','').replaceAll('’','').replaceAll('-','').toLowerCase().replaceAll("/", "").replaceAll("\\.", "")
//                 .replaceAll(" \"", "")
    }

    /**
     * Appends a line to the body of the current article when text collection
     * is enabled. Lines seen before the first article separator are ignored,
     * since there is no article to attach them to.
     *
     * @param str line to append (a newline is added after it)
     */
    def writeText(String str) {
        if (writeTextOK && !ucis.isEmpty()) {
            ucis[-1][1] += str + "\n";
        }
    }

    /**
     * Appends a " *KEY_value" attribute to the header of the current article.
     * Does nothing when no article has been opened yet (indexing an empty
     * list with -1 would throw).
     */
    private void appendHeader(String key, String value) {
        if (ucis.isEmpty()) {
            return
        }
        ucis[-1][0] += " *${key}_" + toAttribute(value)
    }

    /**
     * Text parser for Factiva exports.
     *
     * The parser state is reset at the start of every call, so the returned
     * list only contains the articles found in {@code txt}.
     *
     * @param txt full content of one export file
     * @return list of uci (each one a [header, body] pair)
     */
    def parsetxt(txt) {
        // start from a clean state so that articles of a previously parsed
        // text are not returned (and written) a second time
        ucis = []
        writeTextOK = false
        nextLineIsAnEndMetadataValue = false
        endsKey = null

        // accept both LF and CRLF line endings
        def lines = txt.split("\r?\n")

        for (String line : lines) {
            if (line.length() == 0) continue;

            if (nextLineIsAnEndMetadataValue) {
                // this line is the value of the "ends" key found on the previous line
                appendHeader(endsKey, line)
                nextLineIsAnEndMetadataValue = false;
            } else if (line.startsWith(ARTICLE_SEPARATOR)) {
                ucis << ['****',''] // new article
            } else {
                int idxSpace = line.indexOf(" ")
                if (0 <= idxSpace && idxSpace < 5) { // maybe a key
                    String head = line.substring(0, idxSpace)
                    String tail = line.substring(idxSpace+1)
                    int idxTab = tail.indexOf("\t");

                    if (metadata.containsKey(head)) {
                        if (idxTab == 0 && tail.length() > 0) {
                            // "KEY <tab>value": store the value in the header
                            appendHeader(head, tail.substring(1))
                        } else if (tail.length() > 0) {
                            // ordinary sentence that happens to start with a key-like word
                            writeText(line)
                        }
                    } else if (ends.containsKey(head)) {
                        if (idxTab == 0) nextLineIsAnEndMetadataValue = true;
                        endsKey = head;
                        writeTextOK = false; // stop writing text
                    } else if (starts.containsKey(head)) {
                        writeTextOK = true; // start writing text
                    } else {
                        writeText(line)
                    }
                } else {
                    writeText(line)
                }
            }
        }
        return ucis
    }

    /**
     * Parses every readable, non-hidden regular file of {@code txtdir}
     * (except "import.xml") and writes the header and body of each article
     * to {@code fileout}. Input is decoded with {@code encodage_in} and output
     * is encoded with {@code encodage_out}.
     *
     * @return true once all files have been processed
     */
    def doparse() {
        def files = txtdir.listFiles() // files found in txtdir
        fileout.withWriter(encodage_out) { writer ->
            for (File f : files) { // for each input file...
                if (f.isHidden() || !f.canRead() || !f.isFile() || f.getName() == "import.xml") {
                    continue;
                }
                println "parse file: $f"
                for (def uci : parsetxt(f.getText(encodage_in))) {
                    writer.println uci[0] // header
                    writer.println uci[1] // body
                }
            }
        }
        return true;
    }

    public static void main(String[] args) {
        File  txtdir = new File("/home/mdecorde/xml/factivatxt/") // directory of the input texts
        File fileout = new File("/home/mdecorde/xml/factivatxt/montagne-out.txt")
        String encodage_in = "UTF-8"
        String encodage_out = "UTF-8"
        def converter = new FactivaMail2Alceste(txtdir, fileout, encodage_in, encodage_out);

        println 'Done: '+converter.doparse()
    }
}