Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTMLIndexer.groovy @ 2877

History | View | Annotate | Download (4.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.scripts.importer
29

    
30
import java.util.Collections;
31
import java.io.File;
32
import org.txm.utils.io.FileCopy;
33
import org.txm.utils.*;
34
import java.io.File;
35
import java.util.HashMap;
36
import java.util.ArrayList;
37
import java.io.File;
38
import javax.xml.stream.*;
39
import java.net.URL;
40

    
41
// TODO: Auto-generated Javadoc
42
/**
 * Builds an index of the anchors found in a directory of HTML pages.
 *
 * Each page is read with a StAX parser. Whenever an element whose "id"
 * attribute starts with {@link #idxprefix} is met, the location
 * "fileName#id" is recorded under the last word of the text that precedes
 * the element.
 */
class HTMLIndexer {

	/** Prefix of the "id" attribute values that mark an index anchor. */
	static String idxprefix = "IDX-";

	/** Directory indexed by {@link #main} when no path is given on the command line. */
	private static final String DEFAULT_HTML_DIRECTORY = "/home/mdecorde/xml/html";

	/** Maps a token to the "fileName#anchorId" locations recorded for it. */
	HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

	/**
	 * Indexes every ".html" file returned by the directory scan, merges
	 * near-duplicate tokens and prints the resulting index.
	 *
	 * @param htmlDirectory the html directory
	 * @return true if every HTML file was parsed without error
	 */
	private boolean processHTMLDir(File htmlDirectory)
	{
		// NOTE(review): DeleteDir.scanDirectory is defined outside this file;
		// presumably it lists the files under htmlDirectory - confirm the
		// meaning of the two boolean flags.
		ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true, true)
		Collections.sort(htmlfiles);

		boolean allParsed = true;
		for (File htmlFile : htmlfiles) {//get all indexes
			if (htmlFile.getName().endsWith(".html")) {
				// keep going after a failure so one broken page does not hide the others
				if (!processHTMLFile(htmlFile)) {
					allParsed = false;
				}
			}
		}

		mergeNearDuplicates();

		// sorted so that the listing is the same from one run to the next
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
		Collections.sort(tokens);
		for (String token : tokens) {
			println("Token: "+token);
			println(index.get(token));
		}
		return allParsed;
	}

	/**
	 * Fixes doubles like "étiquette" and "étiquettes": when two neighbours
	 * in the sorted token list differ only by one trailing character, the
	 * locations of the longer token are moved to the shorter one and the
	 * longer token is removed from the index.
	 */
	private void mergeNearDuplicates()
	{
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
		Collections.sort(tokens);

		// stop one element early: each step looks at the pair (i, i+1)
		for (int i = 0 ; i + 1 < tokens.size() ; i++) {
			String t1 = tokens.get(i);
			String t2 = tokens.get(i + 1);
			if (t2.length() == t1.length() + 1 && t2.startsWith(t1)) {
				index.get(t1).addAll(index.get(t2));
				// drop the merged key from the index itself, not only from the local list
				index.remove(t2);
				// t2 no longer exists: step over it
				i++;
			}
		}
	}

	/**
	 * Parses one HTML file and records its index anchors in {@link #index}.
	 *
	 * The file must be well-formed XML: any parsing or I/O error is printed
	 * and reported through the return value. Anchors met before any
	 * non-blank text (or after a text reduced to ".") have no token to be
	 * filed under and are ignored.
	 *
	 * @param htmlFile the html file
	 * @return true if the file was read to the end, false if an error occurred
	 */
	private boolean processHTMLFile(File htmlFile)
	{
		String lasttoken = null;
		def inputData = null;
		// declared outside the try block so that the finally clause can close it
		XMLStreamReader parser = null;

		try {
			URL url = htmlFile.toURI().toURL();
			println "process html file "+url;
			inputData = url.openStream();
			def factory = XMLInputFactory.newInstance();
			factory.setProperty("javax.xml.stream.supportDTD", false);
			factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

			parser = factory.createXMLStreamReader(inputData);
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
			{
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						String id = parser.getAttributeValue(null, "id");
						if (id != null && id.startsWith(idxprefix)
								&& lasttoken != null && lasttoken.length() > 0)
						{
							if (!index.containsKey(lasttoken))
								index.put(lasttoken, new ArrayList<String>());
							index.get(lasttoken).add(htmlFile.getName()+"#"+id)
						}
						break;

					case XMLStreamConstants.CHARACTERS:
						String text = parser.getText().trim();
						if (text.length() > 0) {
							// any whitespace separates words, not only the space character
							String[] words = text.split("\\s+");
							lasttoken = words[words.length - 1];
							if (lasttoken.endsWith("."))
								lasttoken = lasttoken.substring(0, lasttoken.length() - 1)
						}
						break;
				}
			}
			return true;
		}
		catch (Exception e) {
			println("File "+htmlFile+"\n"+e);
			return false;
		}
		finally {
			// XMLStreamReader.close() does not close the underlying stream,
			// so both must be released
			if (parser != null) {
				try {
					parser.close();
				} catch (Exception ignored) {
					// nothing more can be done for a reader that fails to close
				}
			}
			if (inputData != null) {
				try {
					inputData.close();
				} catch (Exception ignored) {
					// nothing more can be done for a stream that fails to close
				}
			}
		}
	}

	/**
	 * The main method.
	 *
	 * @param args the arguments; args[0], when present, is the directory to
	 *             index, otherwise {@link #DEFAULT_HTML_DIRECTORY} is used
	 */
	public static void main(String[] args)
	{
		String path = (args != null && args.length > 0) ? args[0] : DEFAULT_HTML_DIRECTORY
		File htmlDirectory = new File(path)
		new HTMLIndexer().processHTMLDir(htmlDirectory);
	}
}