Revision 2877 tmp/org.txm.core/src/java/org/txm/scripts/importer/HTMLIndexer.groovy

HTMLIndexer.groovy (revision 2877)
58 58
	 */
59 59
	private boolean processHTMLDir(File htmlDirectory)
60 60
	{
61
		ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true)
61
		ArrayList<File> htmlfiles = DeleteDir.scanDirectory(htmlDirectory, true, true)
62 62
		Collections.sort(htmlfiles);
63 63
		
64
		for(File htmlFile : htmlfiles)//get all indexes
65
		{
66
			if(htmlFile.getName().endsWith(".html"))
64
		for (File htmlFile : htmlfiles) {//get all indexes
65
			if (htmlFile.getName().endsWith(".html")) {
67 66
				processHTMLFile(htmlFile);
67
			}
68 68
		}
69 69
		
70 70
		ArrayList<String> tokens = new ArrayList<String>(index.keySet());
71 71
		Collections.sort(tokens);
72 72
		
73 73
		//fix doubles like étiquette&étiquettes
74
		for(int i = 0 ; i < tokens.size() ; i++)
75
		{
74
		for (int i = 0 ; i < tokens.size() ; i++) {
76 75
			String t1 = tokens.get(i);
77 76
			String t2 = tokens.get(i+1);
78
			if(t1.equals(t2.substring(0, t2.length() -1)))
79
			{
77
			if (t1.equals(t2.substring(0, t2.length() -1))) {
80 78
				tokens.remove(i+1);
81 79
				index.get(t1).addAll(index.get(t2));
82 80
				//i--;
......
84 82
		}
85 83
		
86 84
		tokens = new ArrayList<String>(index.keySet());
87
		for(String token : tokens)
88
		{
85
		for (String token : tokens) {
89 86
			println("Token: "+token);
90 87
			println(index.get(token));
91 88
		}
......
107 104
		def inputData = null;
108 105
		def factory = null;
109 106
		
110
		try
111
		{
107
		try {
112 108
			URL url = htmlFile.toURI().toURL();
113 109
			println "process html file "+url;
114 110
			inputData = url.openStream();
......
122 118
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) 
123 119
			{
124 120
				//println "parse"
125
				switch (event) 
126
				{
121
				switch (event) {
127 122
					case XMLStreamConstants.START_ELEMENT:
128 123
						//println "elem "+parser.getLocalName()
129
						if(parser.getLocalName() == "div")
130
							if(parser.getAttributeValue(null, "id") != null && parser.getAttributeValue(null, "id").startsWith("index-body"))
124
						if (parser.getLocalName() == "div")
125
							if (parser.getAttributeValue(null, "id") != null && parser.getAttributeValue(null, "id").startsWith("index-body"))
131 126
								page = parser.getAttributeValue(null, "id")
132 127
								
133 128
						String id = parser.getAttributeValue(null, "id");
134
						if(id != null && id.startsWith(idxprefix))
129
						if (id != null && id.startsWith(idxprefix))
135 130
						{
136
							if(!index.containsKey(lasttoken))
131
							if (!index.containsKey(lasttoken))
137 132
								index.put(lasttoken, new ArrayList<String>());
138 133
							index.get(lasttoken).add(htmlFile.getName()+"#"+id)
139 134
						}
......
142 137
					
143 138
					case XMLStreamConstants.CHARACTERS:
144 139
						String text = parser.getText().trim();
145
						if(text.length() > 0)
146
						{
140
						if (text.length() > 0) {
147 141
							def texts = text.split(" ");
148 142
							lasttoken = texts[texts.size()-1];
149
							if(lasttoken.endsWith("."))
143
							if (lasttoken.endsWith("."))
150 144
								lasttoken = lasttoken.substring(0, lasttoken.length() -1)
151 145
						}
152 146
				}

Also available in: Unified diff