Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / scripts / importer / HTMLIndexer.groovy @ 1000

History | View | Annotate | Download (4.4 kB)

1 881 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 881 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 881 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 881 mdecorde
// Sophia Antipolis, University of Paris 3.
5 881 mdecorde
//
6 881 mdecorde
// The TXM platform is free software: you can redistribute it
7 881 mdecorde
// and/or modify it under the terms of the GNU General Public
8 881 mdecorde
// License as published by the Free Software Foundation,
9 881 mdecorde
// either version 2 of the License, or (at your option) any
10 881 mdecorde
// later version.
11 881 mdecorde
//
12 881 mdecorde
// The TXM platform is distributed in the hope that it will be
13 881 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 881 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 881 mdecorde
// PURPOSE. See the GNU General Public License for more
16 881 mdecorde
// details.
17 881 mdecorde
//
18 881 mdecorde
// You should have received a copy of the GNU General
19 881 mdecorde
// Public License along with the TXM platform. If not, see
20 881 mdecorde
// http://www.gnu.org/licenses.
21 881 mdecorde
//
22 881 mdecorde
//
23 881 mdecorde
//
24 881 mdecorde
// $LastChangedDate:$
25 881 mdecorde
// $LastChangedRevision:$
26 881 mdecorde
// $LastChangedBy:$
27 881 mdecorde
//
28 1000 mdecorde
package org.txm.scripts.importer
29 881 mdecorde
30 881 mdecorde
import java.util.Collections;
31 881 mdecorde
import java.io.File;
32 881 mdecorde
import org.txm.utils.io.FileCopy;
33 881 mdecorde
import org.txm.utils.*;
34 881 mdecorde
import java.io.File;
35 881 mdecorde
import java.util.HashMap;
36 881 mdecorde
import java.util.ArrayList;
37 881 mdecorde
import java.io.File;
38 881 mdecorde
import javax.xml.stream.*;
39 881 mdecorde
import java.net.URL;
40 881 mdecorde
41 881 mdecorde
// TODO: Auto-generated Javadoc
42 881 mdecorde
/**
 * Builds an index of the anchors found in a directory of HTML pages.
 *
 * Each element whose "id" attribute starts with {@link #idxprefix} is recorded
 * under the last word of the text that precedes it, as "fileName#id".
 */
class HTMLIndexer {

        /** Prefix of the "id" attribute values that identify an index anchor. */
        static String idxprefix = "IDX-";

        /** Maps each token to the "fileName#id" locations recorded for it. */
        HashMap<String, ArrayList<String>> index = new HashMap<String, ArrayList<String>>();

        /**
         * Indexes every file whose name ends with ".html" among the files returned by
         * DeleteDir.scanDirectory, merges the entries of tokens that only differ by
         * one trailing character (e.g. a singular and its plural), then prints the
         * resulting index, one token after the other, in sorted order.
         *
         * @param htmlDirectory the html directory
         * @return true if every HTML file was processed without error
         */
        private boolean processHTMLDir(File htmlDirectory)
        {
                ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
                Collections.sort(htmlfiles);

                // collect the index entries of all pages, remembering whether one of them failed
                boolean allProcessed = true;
                for (File htmlFile : htmlfiles)
                {
                        if (htmlFile.getName().endsWith(".html") && !processHTMLFile(htmlFile))
                                allProcessed = false;
                }

                ArrayList<String> tokens = new ArrayList<String>(index.keySet());
                Collections.sort(tokens);

                // fix doubles like étiquette&étiquettes: once sorted, such a pair is adjacent.
                // The loop stops one element before the end because it reads tokens.get(i + 1).
                for (int i = 0 ; i < tokens.size() - 1 ; i++)
                {
                        String t1 = tokens.get(i);
                        String t2 = tokens.get(i + 1);
                        if (t2.length() > 0 && t1.equals(t2.substring(0, t2.length() - 1)))
                        {
                                // move the locations of the longer form under the shorter one and
                                // drop the longer form from the index itself, not only from the list
                                index.get(t1).addAll(index.remove(t2));
                                tokens.remove(i + 1);
                        }
                }

                // 'tokens' now holds exactly the remaining keys of the index, in sorted order
                for (String token : tokens)
                {
                        println("Token: "+token);
                        println(index.get(token));
                }
                return allProcessed;
        }

        /**
         * Parses one HTML file with a StAX reader and adds its anchors to {@link #index}.
         *
         * The last whitespace-separated word of each non-blank text node (without its
         * trailing "." if any) is remembered; every following element whose "id"
         * starts with {@link #idxprefix} is stored under that word as "fileName#id".
         * Anchors met before any usable word has been read are skipped.
         *
         * @param htmlFile the html file
         * @return true if the file was parsed to the end, false if an exception occurred
         *         (the exception is printed, not rethrown)
         */
        private boolean processHTMLFile(File htmlFile)
        {
                String lasttoken = null;
                def inputData = null;
                XMLStreamReader parser = null;

                try
                {
                        URL url = htmlFile.toURI().toURL();
                        println "process html file "+url;
                        inputData = url.openStream();
                        def factory = XMLInputFactory.newInstance();
                        // DTD support and entity replacement are switched off for the parse
                        factory.setProperty("javax.xml.stream.supportDTD", false);
                        factory.setProperty("javax.xml.stream.isReplacingEntityReferences", false);

                        parser = factory.createXMLStreamReader(inputData);
                        for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                        {
                                switch (event)
                                {
                                        case XMLStreamConstants.START_ELEMENT:
                                                String id = parser.getAttributeValue(null, "id");
                                                // a null or empty token would create a meaningless key
                                                // (and a null key breaks the later sort of the tokens)
                                                if (id != null && id.startsWith(idxprefix) && lasttoken != null && lasttoken.length() > 0)
                                                {
                                                        if (!index.containsKey(lasttoken))
                                                                index.put(lasttoken, new ArrayList<String>());
                                                        index.get(lasttoken).add(htmlFile.getName()+"#"+id)
                                                }
                                                break;

                                        case XMLStreamConstants.CHARACTERS:
                                                String text = parser.getText().trim();
                                                if (text.length() > 0)
                                                {
                                                        // split on any whitespace so that tabs and line breaks
                                                        // also separate words
                                                        def texts = text.split("\\s+");
                                                        lasttoken = texts[texts.size()-1];
                                                        if (lasttoken.endsWith("."))
                                                                lasttoken = lasttoken.substring(0, lasttoken.length() -1)
                                                }
                                                break;
                                }
                        }
                }
                catch(Exception e){println("File "+htmlFile+"\n"+e); return false;}
                finally
                {
                        // XMLStreamReader.close() does not close the underlying input source,
                        // so the reader and the stream are both released here
                        try { if (parser != null) parser.close(); } catch (Exception ignored) { /* best effort */ }
                        try { if (inputData != null) inputData.close(); } catch (Exception ignored) { /* best effort */ }
                }
                return true;
        }

        /**
         * Indexes a directory of HTML pages and prints the result.
         *
         * @param args optional; args[0] is the directory to index. When absent,
         *             "/home/mdecorde/xml/html" is used.
         */
        public static void main(String[] args)
        {
                String path = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/html";
                File htmlDirectory = new File(path)
                new HTMLIndexer().processHTMLDir(htmlDirectory);
        }
}