Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / importer / HTMLIndexer.groovy @ 187

History | View | Annotate | Download (4.4 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
// 
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
// 
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
// 
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
// 
// 
// 
// $LastChangedDate:$
// $LastChangedRevision:$
// $LastChangedBy:$ 
//
28
package org.txm.importer
29

    
30
import java.util.Collections;
31
import java.io.File;
32
import org.txm.utils.FileCopy;
33
import org.txm.utils.*;
34
import java.io.File;
35
import java.util.HashMap;
36
import java.util.ArrayList;
37
import java.io.File;
38
import javax.xml.stream.*;
39
import java.net.URL;
40

    
41
/**
 * Builds an in-memory index of the anchors found in a directory of HTML pages.
 *
 * Every element whose {@code id} attribute starts with {@link #idxprefix} is
 * recorded under the last word of the text that precedes it, as a
 * {@code "<file name>#<id>"} reference.
 */
class HTMLIndexer {

    /** Prefix of the {@code id} attribute values that mark an index anchor. */
    static String idxprefix = "IDX-";

    /** Maps a token to the {@code "<file name>#<id>"} references of its anchors. */
    HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

    /**
     * Indexes every {@code .html} file of a directory, merges near-duplicate
     * tokens and prints the resulting index on standard output.
     *
     * @param htmlDirectory the directory that contains the HTML pages
     * @return true if the directory exists and every HTML file was parsed
     *         without error, false otherwise
     */
    private boolean processHTMLDir(File htmlDirectory)
    {
        if (htmlDirectory == null || !htmlDirectory.isDirectory())
        {
            println("Not a directory: "+htmlDirectory);
            return false;
        }

        // NOTE(review): DeleteDir.scanDirectory is a project helper; the meaning of
        // its boolean flag is not visible from this file - confirm against its source.
        ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
        Collections.sort(htmlfiles);

        boolean success = true;
        for (File htmlFile : htmlfiles) // get all indexes
        {
            if (htmlFile.getName().endsWith(".html"))
            {
                // keep going after a failure so one broken page does not hide the others
                success = processHTMLFile(htmlFile) && success;
            }
        }

        // tokens are printed in sorted order, after the merge
        for (String token : mergeNearDuplicateTokens())
        {
            println("Token: "+token);
            println(index.get(token));
        }
        return success;
    }

    /**
     * Merges doubles like "étiquette" and "étiquettes": when a token is
     * immediately followed, in sorted order, by the same token plus exactly one
     * character, the references of the longer token are appended to the shorter
     * one and the longer token is removed from {@link #index}.
     *
     * @return the remaining tokens of the index, sorted
     */
    private List<String> mergeNearDuplicateTokens()
    {
        ArrayList<String> tokens = new ArrayList<String>(index.keySet());
        Collections.sort(tokens);

        int i = 0;
        // stop one element early: there is no successor to compare the last token with
        while (i + 1 < tokens.size())
        {
            String t1 = tokens.get(i);
            String t2 = tokens.get(i + 1);
            if (t2.length() == t1.length() + 1 && t2.startsWith(t1))
            {
                index.get(t1).addAll(index.remove(t2));
                tokens.remove(i + 1);
                // do not advance: the new successor may be another variant of t1
            }
            else
            {
                i++;
            }
        }
        return tokens;
    }

    /**
     * Parses one HTML file with a StAX reader and adds its anchors to
     * {@link #index}. The token of an anchor is the last whitespace-separated
     * word of the most recent non-blank text, without its trailing ".".
     * Anchors met before any usable text are skipped.
     *
     * @param htmlFile the HTML file to parse; it must be well-formed XML
     * @return true if the file was parsed to the end, false if an exception
     *         occurred (the exception is printed on standard output)
     */
    private boolean processHTMLFile(File htmlFile)
    {
        String lasttoken = null;
        InputStream inputData = null;
        XMLStreamReader parser = null;

        try
        {
            URL url = htmlFile.toURI().toURL();
            println "process html file "+url;
            inputData = url.openStream();

            def factory = XMLInputFactory.newInstance();
            // no DTD loading and no entity expansion while reading the pages
            factory.setProperty("javax.xml.stream.supportDTD", false);
            factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

            parser = factory.createXMLStreamReader(inputData);
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                if (event == XMLStreamConstants.START_ELEMENT)
                {
                    String id = parser.getAttributeValue(null, "id");
                    boolean hasToken = lasttoken != null && lasttoken.length() > 0;
                    // without a token there is no meaningful key to file the anchor under
                    if (id != null && id.startsWith(idxprefix) && hasToken)
                    {
                        if (!index.containsKey(lasttoken))
                            index.put(lasttoken, new ArrayList<String>());
                        index.get(lasttoken).add(htmlFile.getName()+"#"+id)
                    }
                }
                else if (event == XMLStreamConstants.CHARACTERS)
                {
                    String text = parser.getText().trim();
                    if (text.length() > 0)
                    {
                        // split on any whitespace so that a line break does not end up inside a token
                        String[] texts = text.split("\\s+");
                        lasttoken = texts[texts.length - 1];
                        if (lasttoken.endsWith("."))
                            lasttoken = lasttoken.substring(0, lasttoken.length() - 1)
                    }
                }
            }
        }
        catch(Exception e){println("File "+htmlFile+"\n"+e); return false;}
        finally
        {
            // XMLStreamReader.close() does not close the underlying stream, so close both
            if (parser != null)
            {
                try { parser.close(); } catch (Exception ignored) { /* nothing left to do on close failure */ }
            }
            if (inputData != null)
            {
                try { inputData.close(); } catch (Exception ignored) { /* nothing left to do on close failure */ }
            }
        }
        return true;
    }

    /**
     * Indexes a directory of HTML pages and prints the index.
     *
     * @param args optional; {@code args[0]} is the directory to index. When it
     *        is missing the historical default "/home/mdecorde/xml/html" is used.
     */
    public static void main(String[] args)
    {
        String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/html";
        File htmlDirectory = new File(path)
        new HTMLIndexer().processHTMLDir(htmlDirectory);
    }
}