Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTMLIndexer.groovy @ 1688

History | View | Annotate | Download (4.6 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
24
// $LastChangedDate:$
25
// $LastChangedRevision:$
26
// $LastChangedBy:$ 
27
//
28
package org.txm.scripts.importer
29

    
30
import java.util.Collections;
31
import java.io.File;
32
import org.txm.utils.io.FileCopy;
33
import org.txm.utils.*;
34
import java.io.File;
35
import java.util.HashMap;
36
import java.util.ArrayList;
37
import java.io.File;
38
import javax.xml.stream.*;
39
import java.net.URL;
40

    
41
// TODO: Auto-generated Javadoc
42
/**
 * Builds an in-memory index of the anchors found in a directory of HTML pages.
 *
 * Every element whose {@code id} attribute starts with {@link #idxprefix} is
 * recorded as {@code "<file name>#<id>"} under the last word of the text that
 * precedes it in the document.
 */
class HTMLIndexer {

    /** Prefix of the {@code id} attribute values that mark an index entry. */
    static String idxprefix = "IDX-";

    /** Maps each indexed token to its list of {@code "<file name>#<id>"} locations. */
    HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

    /**
     * Indexes every {@code .html} file returned by the directory scan, merges
     * tokens that only differ by one trailing character, then prints the index.
     *
     * @param htmlDirectory the html directory
     * @return true if every {@code .html} file was parsed without error
     */
    private boolean processHTMLDir(File htmlDirectory)
    {
        // NOTE(review): presumably lists the files found under htmlDirectory;
        // confirm the contract (and the meaning of 'true') in org.txm.utils.DeleteDir.
        ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
        if (htmlfiles == null) {
            return false;
        }
        Collections.sort(htmlfiles);

        boolean success = true;
        for (File htmlFile : htmlfiles) { // get all indexes
            if (htmlFile.getName().endsWith(".html") && !processHTMLFile(htmlFile)) {
                success = false;
            }
        }

        ArrayList<String> tokens = new ArrayList<String>(index.keySet());
        Collections.sort(tokens);

        // Fix doubles like étiquette & étiquettes: once sorted, a token and its
        // one-character-longer variant are neighbours. The variant's locations are
        // moved to the shorter token and the variant is dropped from the index.
        int i = 0;
        while (i + 1 < tokens.size()) {
            String t1 = tokens.get(i);
            String t2 = tokens.get(i + 1);
            // The empty token must not absorb every one-character token.
            if (t1.length() > 0 && t1.equals(t2.substring(0, t2.length() - 1))) {
                index.get(t1).addAll(index.remove(t2));
                tokens.remove(i + 1);
                // Do not advance: the new neighbour may be another variant of t1.
            } else {
                i++;
            }
        }

        for (String token : tokens) {
            println("Token: "+token);
            println(index.get(token));
        }
        return success;
    }

    /**
     * Parses one HTML file with a StAX reader and adds its index anchors to
     * {@link #index}.
     *
     * The file must be well-formed XML. Parse and I/O errors are printed and
     * reported through the return value rather than thrown.
     *
     * @param htmlFile the html file
     * @return true if the file was read to the end without error
     */
    private boolean processHTMLFile(File htmlFile)
    {
        // Last word of the most recent non-blank text node; null until one is seen.
        String lasttoken = null;
        InputStream inputData = null;
        // Declared outside the try block so the finally clause can close it.
        XMLStreamReader parser = null;

        try {
            URL url = htmlFile.toURI().toURL();
            println "process html file "+url;
            inputData = url.openStream();

            XMLInputFactory factory = XMLInputFactory.newInstance();
            factory.setProperty("javax.xml.stream.supportDTD", false);
            factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

            parser = factory.createXMLStreamReader(inputData);
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        String id = parser.getAttributeValue(null, "id");
                        // An anchor seen before any text has no token to be filed under.
                        if (id != null && id.startsWith(idxprefix) && lasttoken != null) {
                            ArrayList<String> locations = index.get(lasttoken);
                            if (locations == null) {
                                locations = new ArrayList<String>();
                                index.put(lasttoken, locations);
                            }
                            locations.add(htmlFile.getName()+"#"+id);
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        String text = parser.getText().trim();
                        if (text.length() > 0) {
                            // Split on any whitespace run so that line breaks and tabs
                            // inside a text node do not end up inside the token.
                            String[] words = text.split("\\s+");
                            lasttoken = words[words.length - 1];
                            if (lasttoken.endsWith(".")) {
                                lasttoken = lasttoken.substring(0, lasttoken.length() - 1);
                            }
                        }
                        break;
                }
            }
            return true;
        }
        catch (Exception e) {
            println("File "+htmlFile+"\n"+e);
            return false;
        }
        finally {
            // XMLStreamReader.close() does not close the underlying stream, so both
            // are closed, each on its own so one failure cannot leak the other.
            try {
                if (parser != null) parser.close();
            } catch (Exception closeError) {
                println("File "+htmlFile+"\n"+closeError);
            }
            try {
                if (inputData != null) inputData.close();
            } catch (Exception closeError) {
                println("File "+htmlFile+"\n"+closeError);
            }
        }
    }

    /**
     * The main method.
     *
     * @param args optional: args[0] is the directory to index; when absent the
     *             historical default "/home/mdecorde/xml/html" is used
     */
    public static void main(String[] args)
    {
        String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/html";
        File htmlDirectory = new File(path)
        new HTMLIndexer().processHTMLDir(htmlDirectory);
    }
}