Révision 557e0020
b/livrables/L4_2_4/Data_file/Cat1/.Rhistory | ||
---|---|---|
1 |
symbols(0,0,circles=1,inches=FALSE,add=TRUE) |
|
2 |
dev.off() |
|
3 |
return (X) |
|
4 |
} |
|
5 |
funct(filename, "test_bt_cool.eps") |
|
6 |
# Run a centred, scaled PCA on `dataset` and write a plot of the variable
# coordinates on the first two axes to a PostScript/EPS file.
#
# dataset:  data handed unchanged to dudi.pca(); dudi.pca() must already be
#           on the search path (presumably via library(ade4) -- confirm).
# out_file: path of the EPS file to create.
# Returns the object produced by dudi.pca().
draw <- function(dataset, out_file) {
  X <- dudi.pca(dataset, center = TRUE, scale = TRUE, scannf = FALSE)

  postscript(out_file, width = 4.0, height = 4.0, horizontal = FALSE,
             onefile = FALSE, paper = "special", family = "ComputerModern",
             encoding = "TeXtext.enc")
  # Close the device even when a plotting call below fails; otherwise the
  # EPS file stays open and later plots are silently written into it.
  on.exit(dev.off(), add = TRUE)

  # Share of the total eigenvalue sum carried by each axis, in percent.
  keep <- 100 * X$eig / sum(X$eig)

  plot(X$co[, 1], X$co[, 2],
       xlim = c(-1, 1), ylim = c(-1, 1), asp = 1,
       ylab = paste0("2nd principal axis (", round(keep[2], 1), "%)"),
       xlab = paste0("1st principal axis (", round(keep[1], 1), "%)"))

  z <- row.names(X$co)
  print(z[1])
  print(X$co[1, 1])

  # NOTE(review): every arrow starts from one random point and only the first
  # label is drawn, at that same point. This looks like placeholder
  # behaviour -- confirm intent.
  x <- rnorm(5, 0, 1)
  arrows(x[2], x[4], X$co[, 1], X$co[, 2], code = 2, col = 1, length = .25)
  text(x[2], x[4], z[1], col = "navy")

  # Axes through the origin plus the unit circle.
  abline(h = 0, v = 0)
  symbols(0, 0, circles = 1, inches = FALSE, add = TRUE)

  X
}
|
28 |
funct(filename, "test_bt_cool.eps") |
|
29 |
# Run a centred, scaled PCA on `dataset` and write a plot of the variable
# coordinates on the first two axes to a PostScript/EPS file, drawing one
# labelled arrow per variable.
#
# dataset:  data handed unchanged to dudi.pca(); dudi.pca() must already be
#           on the search path (presumably via library(ade4) -- confirm).
# out_file: path of the EPS file to create.
# Returns the object produced by dudi.pca().
draw <- function(dataset, out_file) {
  X <- dudi.pca(dataset, center = TRUE, scale = TRUE, scannf = FALSE)

  postscript(out_file, width = 4.0, height = 4.0, horizontal = FALSE,
             onefile = FALSE, paper = "special", family = "ComputerModern",
             encoding = "TeXtext.enc")
  # Close the device even when a plotting call below fails; otherwise the
  # EPS file stays open and later plots are silently written into it.
  on.exit(dev.off(), add = TRUE)

  # Share of the total eigenvalue sum carried by each axis, in percent.
  keep <- 100 * X$eig / sum(X$eig)

  plot(X$co[, 1], X$co[, 2],
       xlim = c(-1, 1), ylim = c(-1, 1), asp = 1,
       ylab = paste0("2nd principal axis (", round(keep[2], 1), "%)"),
       xlab = paste0("1st principal axis (", round(keep[1], 1), "%)"))

  z <- row.names(X$co)
  print(z[1])
  print(X$co[1, 1])

  # One arrow and one label per variable. seq_len() also covers the case of
  # zero rows, where the loop body must not run at all.
  # NOTE(review): each arrow starts at a fresh random point and the label is
  # placed there, not at the arrow tip -- confirm this is intended.
  for (k in seq_len(nrow(X$co))) {
    x <- rnorm(5, 0, 1)
    arrows(x[2], x[4], X$co[k, 1], X$co[k, 2], code = 2, col = 1, length = .25)
    text(x[2], x[4], z[k], col = "navy")
  }

  # Axes through the origin plus the unit circle.
  abline(h = 0, v = 0)
  symbols(0, 0, circles = 1, inches = FALSE, add = TRUE)

  X
}
|
51 |
funct(filename, "test_bt_cool.eps") |
|
52 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
53 |
funct(filename, "test_bt_cool.eps") |
|
54 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
55 |
funct(filename, "test_bt_cool.eps") |
|
56 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
57 |
funct(filename, "test_bt_cool.eps") |
|
58 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
59 |
funct(filename, "test_bt_cool.eps") |
|
60 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
61 |
funct(filename, "test_bt_cool.eps") |
|
62 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
63 |
funct(filename, "test_bt_cool.eps") |
|
64 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
65 |
funct(filename, "test_bt_cool.eps") |
|
66 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
67 |
funct(filename, "test_bt_cool.eps") |
|
68 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
69 |
funct(filename, "test_bt_cool.eps") |
|
70 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
71 |
funct(filename, "test_bt_cool.eps") |
|
72 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
73 |
funct(filename, "test_bt_cool.eps") |
|
74 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
75 |
funct(filename, "test_bt_cool.eps") |
|
76 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
77 |
funct(filename, "test_bt_cool.eps") |
|
78 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
79 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
80 |
funct(filename, "test_bt_cool.eps") |
|
81 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
82 |
funct(filename, "test_bt_cool.eps") |
|
83 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
84 |
funct(filename, "test_bt_cool.eps") |
|
85 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
86 |
funct(filename, "test_bt_cool.eps") |
|
87 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
88 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
89 |
funct(filename, "test_bt_cool.eps") |
|
90 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
91 |
funct(filename, "test_bt_cool.eps") |
|
92 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
93 |
funct(filename, "test_bt_cool.eps") |
|
94 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
95 |
funct(filename, "test_bt_cool.eps") |
|
96 |
filename="/Users/ghislainlandry/AppProfile/Experiments/PhaseDetect/data.raw/data.pca/BT.txt" |
|
97 |
funct(filename, "test_bt_cool.eps") |
|
98 |
filename="/Users/ghislainlandry/AppProfile/Experiments/PhaseDetect/data.raw/data.pca/bakup/BT.txt" |
|
99 |
funct(filename, "test_bt_cool.eps") |
|
100 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
101 |
funct(filename, "test_bt_cool.eps") |
|
102 |
filename = "/Users/ghislainlandry/AppProfile/Experiments/PhaseDetect/data.raw/data.pca/data.nas/l2.bt.txt" |
|
103 |
funct(filename, "test_bt_cool.eps") |
|
104 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
105 |
funct(filename, "test_bt_cool.eps") |
|
106 |
source('~/AppProfile/Experiments/scriptPdpPca.R') |
|
107 |
funct(filename, "test_bt_cool.eps") |
|
108 |
?as.numeric |
|
109 |
?mode |
|
110 |
?ld |
|
111 |
??ld |
|
112 |
??ForImp |
|
113 |
install.packages("ForImp") |
|
114 |
install.packages("mi") |
|
115 |
?mi |
|
116 |
??mi |
|
117 |
library(stat) |
|
118 |
library(stats) |
|
119 |
?mi |
|
120 |
x <- rnorm(100,0,1) # N(0,1) |
|
121 |
x |
|
122 |
y <- rbinom(100,1,invlogit(1+2*x)) |
|
123 |
library(mi) |
|
124 |
y <- rbinom(100,1,invlogit(1+2*x)) |
|
125 |
y[seq(1,100,10)]<-NA |
|
126 |
dat.xy <- data.frame(x,y) |
|
127 |
head(dat.xy) |
|
128 |
mi.binary(y~x, data = dat.xy) |
|
129 |
x <-rnorm(100,0,1) |
|
130 |
y <- x+4 |
|
131 |
y <- round(y) |
|
132 |
y[y<0] <- 0 |
|
133 |
y |
|
134 |
y[seq(1,100,10)] <- NA |
|
135 |
?seq |
|
136 |
dat.xy <- data.frame(x,y) |
|
137 |
head(dat.xy) |
|
138 |
mi.categorical(formula = y ~ x, data = dat.xy) |
|
139 |
z <- mi.categorical(formula = y ~ x, data = dat.xy) |
|
140 |
head(z) |
|
141 |
head(dat.xy) |
|
142 |
z <- as.data.frame(mi.categorical(formula = y ~ x, data = dat.xy)) |
|
143 |
head(z) |
|
144 |
z <- mi.categorical(formula = y ~ x, data = dat.xy) |
|
145 |
z |
|
146 |
?mi |
|
147 |
library(doMc) |
|
148 |
library(doMC) |
|
149 |
dir() |
|
150 |
?factors |
|
151 |
Factors |
|
152 |
?Factors |
|
153 |
?as.factor |
|
154 |
?remove |
|
155 |
?random |
|
156 |
?rand |
|
157 |
??random |
|
158 |
seed <- 3433 |
|
159 |
set.seed(seed) |
|
160 |
?runif |
|
161 |
runif(1) |
|
162 |
runif(1) |
|
163 |
sample(1:6,10,replace=F) |
|
164 |
sample(1:20,10,replace=F) |
|
165 |
x <- sample(1:20,10,replace=F) |
|
166 |
x[1] |
|
167 |
df.fits <- list() |
|
168 |
df.fits |
|
169 |
?complete.cases() |
|
170 |
x <- c(0.8, 0.47, 0.51, 0.73, 0.36, 0.58, 0.57, 0.85, 0.44, 0.42) |
|
171 |
y <- c(1.39, 0.72, 1.55, 0.48, 1.19, -1.59, 1.23, -0.65, 1.49, 0.05) |
|
172 |
fit <- lm(y ~ x -1) |
|
173 |
fit |
|
174 |
x <- c(10.1, 9.2, 8.4, 9.5, 7.5) |
|
175 |
y <- c(9.9, 9.1, 8.4, 9.3, 7.2) |
|
176 |
mean(x+y) |
|
177 |
mean(x) |
|
178 |
mean(y) |
|
179 |
x |
|
180 |
sum(x) |
|
181 |
sum(x)*12 |
|
182 |
x <- c(0.8, 0.47, 0.51, 0.73, 0.36, 0.58, 0.57, 0.85, 0.44, 0.42) |
|
183 |
y <- c(1.39, 0.72, 1.55, 0.48, 1.19, -1.59, 1.23, -0.65, 1.49, 0.05) |
|
184 |
fit <- lm(y ~ 0+x) |
|
185 |
summary(fit) |
|
186 |
fit <- lm(y ~ x-1) |
|
187 |
summary(fit) |
|
188 |
x <- -5:5 |
|
189 |
y <- c(5.12, 3.93, 2.67, 1.87, 0.52, 0.08, 0.93, 2.05, 2.54, 3.87, 4.97) |
|
190 |
knots <- rep(0,10) |
|
191 |
splineTerms <- sapply(knots, function(k) (x > k) * (x -k)) |
|
192 |
xMat <- cbind(1, x, splineTerms) |
|
193 |
yhat <- predict(lm(y ~ xMat - 1)) |
|
194 |
plot(x, y, frame = FALSE, pch = 21, bg = "lightblue", cex = 2) |
|
195 |
lines(x, yhat, col = "red", lwd = 2) |
|
196 |
fit <- lm(y ~ xMat - 1) |
|
197 |
library(ggplot2) |
|
198 |
?rnorm |
|
199 |
?runif |
|
200 |
x <- runif(10000, 50,47000) |
|
201 |
hist(x) |
|
202 |
con <- url("http://www.router-switch.com/search/server/?pagesize=150") |
|
203 |
library(XML) |
|
204 |
html <- htmlTreeParse(con, useInternalNodes=T) |
|
205 |
url <- "http://www.router-switch.com/search/server/?pagesize=150" |
|
206 |
html <- htmlTreeParse(url, useInternalNodes=T) |
|
207 |
Y <- xpathSAply(html, "//List Price:", xmlValue) |
|
208 |
Y <- xpathSApply(html, "//List Price:", xmlValue) |
|
209 |
Y <- xpathSApply(html, "//td[@id='List Price:']", xmlValue) |
|
210 |
head(y) |
|
211 |
head(Y) |
|
212 |
Y |
|
213 |
Y[0] |
|
214 |
xpathSApply(html, "//title", xmlValue) |
|
215 |
head(html) |
|
216 |
xpathSApply(html, "//td[@id='List Price:']", xmlValue) |
|
217 |
html |
|
218 |
doc.text = unlist(xpathApply(html, '//p', xmlValue)) |
|
219 |
head(doc.tex) |
|
220 |
head(doc.text) |
|
221 |
"List" %in% doc.text |
|
222 |
doc.text |
|
223 |
doc.text = gsub('\\n', ' ', doc.text) |
|
224 |
doc.text |
|
225 |
doc.text = gsub('\\r', '', doc.text) |
|
226 |
doc.text |
|
227 |
"Our" %in% doc.text |
|
228 |
doc.text[338] |
|
229 |
doc.text[339] |
|
230 |
"Our" %in% doc.textdoc.text[339] |
|
231 |
"Our" %in% doc.text[339] |
|
232 |
class(doc.text[339]) |
|
233 |
?gsub |
|
234 |
y <- gsub(" ", " ",doc.text) |
|
235 |
y |
|
236 |
y <- paste(doc.text, collapse=" ") |
|
237 |
y |
|
238 |
?grepl |
|
239 |
y <- grepl("Our",doc.text[339]) |
|
240 |
y |
|
241 |
doc.text |
|
242 |
y <- grepl("Our",doc.text[303]) |
|
243 |
y |
|
244 |
y <- grepl("List",doc.text[303]) |
|
245 |
y |
|
246 |
y <- grepl(c("List","Our"), doc.text[303]) |
|
247 |
y <- grepl("List, Our", doc.text[303]) |
|
248 |
y |
|
249 |
y <- grepl("List" "Our", doc.text[303]) |
|
250 |
y <- grepl("List", "Our", doc.text[303]) |
|
251 |
y |
|
252 |
y <- grepl("$",, doc.text[303]) |
|
253 |
y <- grepl("\\$",, doc.text[303]) |
|
254 |
y <- grepl("\$",, doc.text[303]) |
|
255 |
y <- grepl("\$",doc.text[303]) |
|
256 |
y <- grepl("\\$",doc.text[303]) |
|
257 |
y |
|
258 |
x <- rapply(lapply(doc.text, function(x){grepl("\\$",doc.text[303])}), function(z){z}) |
|
259 |
x[1:20] |
|
260 |
FALSE %in% x |
|
261 |
x <- rapply(lapply(doc.text, function(x){grepl("\\$",doc.text)}), function(z){z}) |
|
262 |
x[1:20] |
|
263 |
FALSE %in% x |
|
264 |
y <- doc.text[x] |
|
265 |
length(y) |
|
266 |
head(y) |
|
267 |
class(y) |
|
268 |
y[1] |
|
269 |
y[2] |
|
270 |
y <- gsub("List Price: USD$", "", doc.text[x]) |
|
271 |
head(y) |
|
272 |
y <- gsub("List Price: USD$", "", y[1]) |
|
273 |
y |
|
274 |
y[1] |
|
275 |
y |
|
276 |
gsub("List Price: USD\\$", "", y) |
|
277 |
as.numeric("3,495.00") |
|
278 |
gsub(",", "","3,495.00") |
|
279 |
y <- doc.text[x] |
|
280 |
head(y) |
|
281 |
lprice <- rapply(lapply(y, function(a){gsub("List Price: USD\\$", "", a)}), function(z){z}) |
|
282 |
head(lprice) |
|
283 |
lprice <- rapply(lapply(lprice, function(a){gsub("Our Price: USD\\$", "", a)}), function(z){z}) |
|
284 |
head(lprice) |
|
285 |
prices <- rapply(lapply(lprice, function(x){gsub(",", "",x)}), function(z){as.numeric(z)}) |
|
286 |
prices |
|
287 |
tail(lprice) |
|
288 |
tail(y) |
|
289 |
y <- doc.text[x] |
|
290 |
tail(y) |
|
291 |
head(y) |
|
292 |
?ifelse |
|
293 |
Z <- ifelse(is.na(y), True, False) |
|
294 |
Z <- ifelse(is.na(y), T, F) |
|
295 |
lean(Z) |
|
296 |
length(Z) |
|
297 |
l <- y[!Z] |
|
298 |
l |
|
299 |
# Scrape the prices shown on a product-listing page.
#
# pagelink: URL or file path handed to htmlTreeParse().
# Returns a numeric vector with one entry per <p> element whose text contains
# a "$"; text that does not reduce to a number becomes NA (with a warning
# from as.numeric()).
serversPrices <- function(pagelink) {
  # The XPath query below needs the internal-node representation.
  html <- htmlTreeParse(pagelink, useInternalNodes = TRUE)

  # Text of every <p> element, flattened to a character vector.
  doc.text <- unlist(xpathApply(html, "//p", xmlValue))

  # Line breaks inside a paragraph become plain spaces.
  doc.text <- gsub("\\n", " ", doc.text)
  doc.text <- gsub("\\r", " ", doc.text)

  # grepl() is vectorised, so one call yields the mask. The earlier code
  # re-ran it once per element and then replaced the matched text with an
  # is.na() mask built from the undefined objects `True`/`False`.
  lprice <- doc.text[grepl("\\$", doc.text)]

  # Drop the "List Price: USD$" / "Our Price: USD$" label, tolerating any
  # amount of whitespace between its parts.
  lprice <- gsub("(List|Our)[[:space:]]*Price:[[:space:]]*USD\\$", "", lprice)

  # Remove thousands separators before converting.
  as.numeric(gsub(",", "", lprice, fixed = TRUE))
}
|
320 |
prices <- serversPrices("http://www.router-switch.com/search/server/?pagesize=150") |
|
321 |
# Scrape the prices shown on a product-listing page.
#
# pagelink: URL or file path handed to htmlTreeParse().
# Returns a numeric vector with one entry per <p> element whose text contains
# a "$"; text that does not reduce to a number becomes NA (with a warning
# from as.numeric()).
serversPrices <- function(pagelink) {
  # The XPath query below needs the internal-node representation.
  html <- htmlTreeParse(pagelink, useInternalNodes = TRUE)

  # Text of every <p> element, flattened to a character vector.
  doc.text <- unlist(xpathApply(html, "//p", xmlValue))

  # Line breaks inside a paragraph become plain spaces.
  doc.text <- gsub("\\n", " ", doc.text)
  doc.text <- gsub("\\r", " ", doc.text)

  # grepl() is vectorised, so one call yields the mask. The earlier code
  # overwrote the matched text with a TRUE/FALSE vector, so every "price"
  # was parsed from the strings "TRUE"/"FALSE" and came out NA.
  lprice <- doc.text[grepl("\\$", doc.text)]

  # Drop the "List Price: USD$" / "Our Price: USD$" label, tolerating any
  # amount of whitespace between its parts.
  lprice <- gsub("(List|Our)[[:space:]]*Price:[[:space:]]*USD\\$", "", lprice)

  # Remove thousands separators before converting.
  as.numeric(gsub(",", "", lprice, fixed = TRUE))
}
|
342 |
prices <- serversPrices("http://www.router-switch.com/search/server/?pagesize=150") |
|
343 |
warnings() |
|
344 |
length(prices) |
|
345 |
# Scrape the prices shown on a product-listing page.
#
# pagelink: URL or file path handed to htmlTreeParse().
# Returns a numeric vector with one entry per <p> element whose text contains
# a "$"; text that does not reduce to a number becomes NA (with a warning
# from as.numeric()).
serversPrices <- function(pagelink) {
  # The XPath query below needs the internal-node representation.
  html <- htmlTreeParse(pagelink, useInternalNodes = TRUE)

  # Text of every <p> element, flattened to a character vector.
  doc.text <- unlist(xpathApply(html, "//p", xmlValue))

  # Line breaks inside a paragraph become plain spaces.
  doc.text <- gsub("\\n", " ", doc.text)
  doc.text <- gsub("\\r", " ", doc.text)

  # grepl() is vectorised, so one call yields a mask of the right length.
  # Re-running it once per element produced an over-long index whose extra
  # positions turned into NA entries that then had to be filtered out.
  lprice <- doc.text[grepl("\\$", doc.text)]

  # Drop the "List Price: USD$" / "Our Price: USD$" label, tolerating any
  # amount of whitespace between its parts.
  lprice <- gsub("(List|Our)[[:space:]]*Price:[[:space:]]*USD\\$", "", lprice)

  # Remove thousands separators before converting.
  as.numeric(gsub(",", "", lprice, fixed = TRUE))
}
|
368 |
prices <- serversPrices("http://www.router-switch.com/search/server/?pagesize=150") |
|
369 |
# Scrape the prices shown on a product-listing page.
#
# pagelink: URL or file path handed to htmlTreeParse().
# Returns a numeric vector with one entry per <p> element whose text contains
# a "$"; text that does not reduce to a number becomes NA (with a warning
# from as.numeric()).
serversPrices <- function(pagelink) {
  # The XPath query below needs the internal-node representation.
  html <- htmlTreeParse(pagelink, useInternalNodes = TRUE)

  # Text of every <p> element, flattened to a character vector.
  doc.text <- unlist(xpathApply(html, "//p", xmlValue))

  # Line breaks inside a paragraph become plain spaces.
  doc.text <- gsub("\\n", " ", doc.text)
  doc.text <- gsub("\\r", " ", doc.text)

  # grepl() is vectorised, so one call yields a mask of the right length.
  # Re-running it once per element produced an over-long index whose extra
  # positions turned into NA entries that then had to be filtered out.
  lprice <- doc.text[grepl("\\$", doc.text)]

  # Drop the "List Price: USD$" / "Our Price: USD$" label, tolerating any
  # amount of whitespace between its parts.
  lprice <- gsub("(List|Our)[[:space:]]*Price:[[:space:]]*USD\\$", "", lprice)

  # Remove thousands separators before converting.
  as.numeric(gsub(",", "", lprice, fixed = TRUE))
}
|
392 |
prices <- serversPrices("http://www.router-switch.com/search/server/?pagesize=150") |
|
393 |
length(prices) |
|
394 |
prices |
|
395 |
# Scrape the price strings shown on a product-listing page.
#
# pagelink: URL or file path handed to htmlTreeParse().
# Returns a character vector with one entry per <p> element whose text
# contains a "$", with the "List Price: USD$" / "Our Price: USD$" label
# removed. Numeric conversion is intentionally not done in this variant.
serversPrices <- function(pagelink) {
  # The XPath query below needs the internal-node representation.
  html <- htmlTreeParse(pagelink, useInternalNodes = TRUE)

  # Text of every <p> element, flattened to a character vector.
  doc.text <- unlist(xpathApply(html, "//p", xmlValue))

  # Line breaks inside a paragraph become plain spaces.
  doc.text <- gsub("\\n", " ", doc.text)
  doc.text <- gsub("\\r", " ", doc.text)

  # grepl() is vectorised, so one call yields a mask of the right length.
  # Re-running it once per element produced an over-long index whose extra
  # positions turned into NA entries that then had to be filtered out.
  lprice <- doc.text[grepl("\\$", doc.text)]

  # Drop the price label, tolerating any amount of whitespace between its
  # parts.
  gsub("(List|Our)[[:space:]]*Price:[[:space:]]*USD\\$", "", lprice)
}
|
419 |
prices <- serversPrices("http://www.router-switch.com/search/server/?pagesize=150") |
|
420 |
prices |
|
421 |
lprice <- rapply(lapply(lprice, function(a){gsub("Our Price: USD\\$", "", a)}), |
|
422 |
function(z){z}) |
|
423 |
lprice |
|
424 |
doc.text = unlist(xpathApply(html, '//p', xmlValue)) |
|
425 |
doc.text = gsub('\\n', ' ', doc.text) |
|
426 |
doc.text = gsub('\\r', ' ', doc.text) |
|
427 |
doc.text |
|
428 |
x <- rapply(lapply(doc.text, function(x){grepl("Price",doc.text)}), function(z){z}) |
|
429 |
head(x) |
|
430 |
y <- doc.text[x] |
|
431 |
length(y) |
|
432 |
head(y) |
|
433 |
tail(y) |
|
434 |
Z <- ifelse(is.na(y), T, F) |
|
435 |
y <- y[!Z] |
|
436 |
tail(y) |
|
437 |
t <- y |
|
438 |
length(y) |
|
439 |
head(y) |
|
440 |
gsub(" $", "", "Our Price: USD$4,598.00 ") |
|
441 |
gsub(" $ ", "", "Our Price: USD$4,598.00 ") |
|
442 |
gsub("\\s", " ", "Our Price: USD$4,598.00 ") |
|
443 |
gsub("\t", " ", "Our Price: USD$4,598.00 ") |
|
444 |
gsub("\\t", "", "Our Price: USD$4,598.00 ") |
|
445 |
gsub("\\t", "", "Our Price: USD$4,598.00 ") |
|
446 |
str.replace(/[\t\n\r]/gm,'') |
|
447 |
str <- "Our Price: USD$4,598.00 " |
|
448 |
str.replace(/[\t\n\r]/gm,'') |
|
449 |
y[72] |
|
450 |
gsub("\t","",y[72]) |
|
451 |
gsub("\t","",as.character(y[72])) |
|
452 |
gsub(" ","",as.character(y[72])) |
|
453 |
lprice <- rapply(lapply(y, function(a){" ","",as.character(a)}), |
|
454 |
function(z){z}) |
|
455 |
lprice <- rapply(lapply(y, function(a){" ","",as.character(a)}),function(z){z}) |
|
456 |
lprice <- rapply(lapply(y, function(a){gsub(" ","",as.character(a))}),function(z){z}) |
|
457 |
head(lprice) |
|
458 |
lprice <- rapply(lapply(y, function(a){gsub("List Price:USD\\$", "", a)}), |
|
459 |
function(z){z}) |
|
460 |
head(lprice) |
|
461 |
lprice <- rapply(lapply(y, function(a){gsub(" ","",as.character(a))}),function(z){z}) |
|
462 |
lprice <- rapply(lapply(lprice, function(a){gsub("List Price:USD\\$", "", a)}), |
|
463 |
function(z){z}) |
|
464 |
head(lprice) |
|
465 |
lprice <- rapply(lapply(lprice, function(a){gsub("ListPrice:USD\\$", "", a)}), |
|
466 |
function(z){z}) |
|
467 |
head(lprice) |
|
468 |
lprice <- rapply(lapply(lprice, function(a){gsub("OurPrice:USD\\$", "", a)}), |
|
469 |
function(z){z}) |
|
470 |
head(lprice) |
|
471 |
prices <- rapply(lapply(lprice, function(x){gsub(",", "",x)}), |
|
472 |
function(z){as.numeric(z)}) |
|
473 |
prices |
|
474 |
hist(prices) |
|
475 |
?rweibull |
|
476 |
hist(rweibull(1e5,1.5,33)) |
|
477 |
mean(prices) |
|
478 |
hist(rweibull(1e5,1.5,mean(prices))) |
|
479 |
?rweibull |
|
480 |
y <- rweibull(1e5,1.5,mean(prices)) |
|
481 |
mean(y) |
|
482 |
y <- rweibull(1e5,2,mean(prices)) |
|
483 |
mean(y) |
|
484 |
y <- rweibull(1e5,1,mean(prices)) |
|
485 |
mean(y) |
|
486 |
hist(y) |
|
487 |
y <- rweibull(1e5,1.5,mean(prices)) |
|
488 |
hist(y) |
|
489 |
y <- rweibull(1e5,3.5,mean(prices)) |
|
490 |
hist(y) |
|
491 |
y <- rweibull(1e5,2.5,mean(prices)) |
|
492 |
hist(y) |
|
493 |
y <- rweibull(1e5,2,mean(prices)) |
|
494 |
hist(y) |
|
495 |
mean(y) |
|
496 |
setwd("/Users/ghislainlandry/Enseignements/HaasAnalysis/Cat1") |
|
497 |
dir() |
|
498 |
require(xlsReadWrite) |
|
499 |
require(XLConnect) |
|
500 |
require(XLConnect) |
|
501 |
require(xlsx) |
|
502 |
?read.xlsx |
|
503 |
conf1_df <- read.xlsx("categoryOneServer.xlsx", sheetName = "conf1") |
|
504 |
head(conf1_df) |
|
505 |
procs <- read.xlsx("conf1extensions.xlsx", sheetName = "procs") |
|
506 |
head(procs) |
|
507 |
sep(nrow(procs)) |
|
508 |
seq(nrow(procs)) |
|
509 |
procs$id <- seq(nrow(procs)) |
|
510 |
head(procs) |
|
511 |
y <- merge(procs, conf1_df, by= NULL) |
|
512 |
head(y) |
b/livrables/L4_2_4/Data_file/Cat1/GenerateServers.py | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
|
|
3 |
from __future__ import division |
|
4 |
import numpy as np |
|
5 |
import pandas as pd |
|
6 |
from numpy import genfromtxt |
|
7 |
from pandas import DataFrame |
|
8 |
|
|
9 |
import re |
|
10 |
import math |
|
11 |
import sys |
|
12 |
import os |
|
13 |
|
|
14 |
|
|
15 |
|
|
16 |
|
|
17 |
def duplicate(x, n):
    """Return a list holding the first element of ``x`` repeated ``n`` times.

    ``x`` may be any indexable sequence or a pandas Series. For a Series the
    first element is taken by position, so the call no longer fails when the
    index has no label ``0``.
    """
    first = x.iloc[0] if hasattr(x, "iloc") else x[0]
    return [first] * n
|
19 |
|
|
20 |
|
|
21 |
# Category directories to process.
basedirs = ["Cat1", "Cat2", "Cat3"]

for directory in basedirs:
    # TODO: per-directory processing is not written yet; `pass` keeps the
    # script syntactically valid until it is.
    pass
|
26 |
|
|
27 |
|
b/livrables/L4_2_4/Data_file/Cat1/codeServers.R | ||
---|---|---|
1 |
# library() stops with an error when the package is missing; require() would
# only return FALSE and let read.xlsx() fail later with a less clear message.
library(xlsx)

# Base configuration sheet, with a 1-based row identifier added.
conf1_df <- read.xlsx("categoryOneServer.xlsx", sheetName = "conf1")
# seq_len() yields an empty sequence for an empty sheet; seq(0) would be
# c(1, 0) and make the assignment fail.
conf1_df$id <- seq_len(nrow(conf1_df))

# Processor options for that configuration, numbered the same way.
procs <- read.xlsx("conf1extensions.xlsx", sheetName = "procs")
procs$id <- seq_len(nrow(procs))
|
8 |
|
b/livrables/L4_2_4/Data_file/Cat1/makeserverinstances.py | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
|
|
3 |
from __future__ import division |
|
4 |
import numpy as np |
|
5 |
import pandas as pd |
|
6 |
from numpy import genfromtxt |
|
7 |
from pandas import DataFrame |
|
8 |
|
|
9 |
import re |
|
10 |
import math |
|
11 |
import sys |
|
12 |
|
|
13 |
|
|
14 |
|
|
15 |
def duplicate(x, n):
    """Return a list holding the first element of ``x`` repeated ``n`` times.

    ``x`` may be any indexable sequence or a pandas Series. For a Series the
    first element is taken by position, so the call no longer fails when the
    index has no label ``0``.
    """
    first = x.iloc[0] if hasattr(x, "iloc") else x[0]
    return [first] * n
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
# Workbook with the base configuration sheets.
cat1 = pd.ExcelFile("categoryOneServer.xlsx")

# Maps configuration name -> expanded DataFrame built for it below.
dico = {}

configurations = ["conf1", "conf2"]


def _expand_with_component(frame, component):
    """Return ``frame`` followed by each of its rows outer-merged with ``component``.

    Columns that do not belong to ``component`` (and are not "id") are filled
    in every merged row with the value from that merge's first row.
    """
    component_names = list(component.columns)
    pieces = [frame]
    for i in range(frame.shape[0]):
        row = pd.DataFrame(frame.iloc[i, ]).T  # one-row frame for row i
        merged = row.merge(component, how="outer", copy=False)
        carried = [c for c in merged.columns
                   if c not in component_names and c != "id"]
        merged[carried] = merged[carried].apply(duplicate, 0, args=[merged.shape[0]])
        pieces.append(merged)
    # One concat instead of an append per row: same rows, same order.
    return pd.concat(pieces)


for item in configurations:
    # Base sheet of *this* configuration. It used to be hard-coded to "conf1",
    # so every configuration was built on the conf1 base.
    base_conf = cat1.parse(item)
    base_conf["id"] = range(1, base_conf.shape[0] + 1)

    # Each configuration has its own "<name>extensions.xlsx" workbook.
    ext = pd.ExcelFile(item + "extensions.xlsx")

    procs = ext.parse("procs")
    procs["id"] = range(1, procs.shape[0] + 1)

    mem = ext.parse("mem")
    mem["id"] = range(1, mem.shape[0] + 1)

    storage = ext.parse("storage3inch")
    storage["id"] = range(1, storage.shape[0] + 1)

    #network = ext.parse("network")
    #network["id"] = range(1, network.shape[0] + 1)

    # Processors: outer-merge the base rows with the processor options, then
    # copy the base-only columns from the first row into every row.
    base_conf["procUnitCost"] = 0
    df = base_conf.merge(procs, how='outer', copy=False)

    proc_names = list(procs.columns)
    colnames = [c for c in df.columns if c not in proc_names and c != "id"]
    df[colnames] = df[colnames].apply(duplicate, 0, args=[df.shape[0]])

    # Memory, then storage: the same row-by-row expansion for each.
    df["MemUnitCost"] = 0
    data = _expand_with_component(df, mem)

    data["StorageUnitCost"] = 0
    data = _expand_with_component(data, storage)

    dico[item] = data
|
88 |
|
|
89 |
|
|
90 |
# Stack conf1 first, then every remaining configuration underneath it.
# DataFrame.append() returns a new frame; the earlier loop discarded that
# result, so only conf1 ever reached the output file.
configurations.remove("conf1")
df = pd.concat([dico["conf1"]] + [dico[name] for name in configurations])

#df = dico["conf1"].append(dico['conf2'])

print(df.head())

df["category"] = "standard"

# Total cost is the sum of every column whose name contains "Cost". The list
# is taken before the "Cost" column itself is created.
cost_fields = [col for col in df.columns if "Cost" in col]
df["Cost"] = 0

for col in cost_fields:
    df["Cost"] += df[col]

df.to_excel("standard.xlsx", sheet_name='cat', engine='xlsxwriter', index=False)
|
110 |
|
b/livrables/L4_2_4/Data_file/Cat1/makeserverinstances.py~ | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
|
|
3 |
from __future__ import division |
|
4 |
import numpy as np |
|
5 |
import pandas as pd |
|
6 |
from numpy import genfromtxt |
|
7 |
from pandas import DataFrame |
|
8 |
|
|
9 |
import re |
|
10 |
import math |
|
11 |
import sys |
|
12 |
|
|
13 |
|
|
14 |
|
|
15 |
def duplicate(x, n):
    """Return a list holding the first element of ``x`` repeated ``n`` times.

    ``x`` may be any indexable sequence or a pandas Series. For a Series the
    first element is taken by position, so the call no longer fails when the
    index has no label ``0``.
    """
    first = x.iloc[0] if hasattr(x, "iloc") else x[0]
    return [first] * n
|
17 |
|
|
18 |
|
|
19 |
|
|
20 |
# Workbook with the base configuration sheets.
cat1 = pd.ExcelFile("categoryOneServer.xlsx")

# Maps configuration name -> expanded DataFrame built for it below.
dico = {}

configurations = ["conf1", "conf2"]


def _expand_with_component(frame, component):
    """Return ``frame`` followed by each of its rows outer-merged with ``component``.

    Columns that do not belong to ``component`` (and are not "id") are filled
    in every merged row with the value from that merge's first row.
    """
    component_names = list(component.columns)
    pieces = [frame]
    for i in range(frame.shape[0]):
        row = pd.DataFrame(frame.iloc[i, ]).T  # one-row frame for row i
        merged = row.merge(component, how="outer", copy=False)
        carried = [c for c in merged.columns
                   if c not in component_names and c != "id"]
        merged[carried] = merged[carried].apply(duplicate, 0, args=[merged.shape[0]])
        pieces.append(merged)
    # One concat instead of an append per row: same rows, same order.
    return pd.concat(pieces)


for item in configurations:
    # Base sheet of *this* configuration. It used to be hard-coded to "conf1",
    # so every configuration was built on the conf1 base.
    base_conf = cat1.parse(item)
    base_conf["id"] = range(1, base_conf.shape[0] + 1)

    # Each configuration has its own "<name>extensions.xlsx" workbook.
    ext = pd.ExcelFile(item + "extensions.xlsx")

    procs = ext.parse("procs")
    procs["id"] = range(1, procs.shape[0] + 1)

    mem = ext.parse("mem")
    mem["id"] = range(1, mem.shape[0] + 1)

    storage = ext.parse("storage3inch")
    storage["id"] = range(1, storage.shape[0] + 1)

    #network = ext.parse("network")
    #network["id"] = range(1, network.shape[0] + 1)

    # Processors: outer-merge the base rows with the processor options, then
    # copy the base-only columns from the first row into every row.
    base_conf["procUnitCost"] = 0
    df = base_conf.merge(procs, how='outer', copy=False)

    proc_names = list(procs.columns)
    colnames = [c for c in df.columns if c not in proc_names and c != "id"]
    df[colnames] = df[colnames].apply(duplicate, 0, args=[df.shape[0]])

    # Memory, then storage: the same row-by-row expansion for each.
    df["MemUnitCost"] = 0
    data = _expand_with_component(df, mem)

    data["StorageUnitCost"] = 0
    data = _expand_with_component(data, storage)

    dico[item] = data
|
88 |
|
|
89 |
|
|
90 |
# Stack conf1 first, then every remaining configuration underneath it.
# DataFrame.append() returns a new frame; the earlier loop discarded that
# result, so only conf1 ever reached the output file.
configurations.remove("conf1")
df = pd.concat([dico["conf1"]] + [dico[name] for name in configurations])

#df = dico["conf1"].append(dico['conf2'])

print(df.head())

df["category"] = "standard"

# Total cost is the sum of every column whose name contains "Cost". The list
# is taken before the "Cost" column itself is created.
cost_fields = [col for col in df.columns if "Cost" in col]
df["Cost"] = 0

for col in cost_fields:
    df["Cost"] += df[col]

df.to_excel("standard.xlsx", sheet_name='cat', engine='xlsxwriter', index=False)
|
110 |
|
b/livrables/L4_2_4/Data_file/GenerateServers.py | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
|
|
3 |
from __future__ import division |
|
4 |
import numpy as np |
|
5 |
import pandas as pd |
|
6 |
from numpy import genfromtxt |
|
7 |
from pandas import DataFrame |
|
8 |
|
|
9 |
|
|
10 |
import re |
|
11 |
import math |
|
12 |
import sys |
|
13 |
import os |
|
14 |
from optparse import OptionParser |
|
15 |
|
|
16 |
import logging |
|
17 |
|
|
18 |
|
|
19 |
''' |
|
20 |
python version: |
|
21 |
2.7.8 |Anaconda 2.0.1 (x86_64) |
|
22 |
|
|
23 |
full description: |
|
24 |
2.7.8 |Anaconda 2.0.1 (x86_64)| (default, Aug 21 2014, 15:21:46) \n[GCC 4.2.1 (Apple Inc. build 5577)] |
|
25 |
''' |
|
26 |
|
|
27 |
|
|
28 |
np.random.seed(12345) |
|
29 |
|
|
30 |
|
|
31 |
class GenerateDate():
    """Build a table of server configurations and compare hosting costs.

    ``merge_datasets`` reads the Excel workbooks found in each directory of
    ``self.basedirs``, expands every base configuration with the options
    listed in the matching "extensions" workbook and stores the result in
    ``self.dfservers``.  ``process_data`` then filters that table by the
    requested hardware characteristics and adds the cost of an owned
    ("traditional") installation and of a rented ("on demand") one.
    """

    # Unit-cost columns every expanded configuration table must expose.
    _UNIT_COST_COLUMNS = ["procUnitCost", "MemUnitCost",
                          "StorageUnitCost", "netUnitCost"]

    # Extension sheet name -> name of the unit-cost column it feeds.
    _SHEET_TO_COST_COLUMN = {
        "procs": "procUnitCost",
        "mem": "MemUnitCost",
        "storage3inch": "StorageUnitCost",
        "storage2inch": "StorageUnitCost",
        "network": "netUnitCost",
    }

    # Column of ``dfservers`` -> attribute holding the minimum accepted value.
    _MINIMUM_FILTERS = {
        "numberCores": "number_cores",
        "memory": "memory_size",
        "storageSize": "storage_size",
    }

    def __init__(self, nber_servers,
                 maintainanceCost,
                 maintainance_hours,
                 software_init_cost,
                 software_update_cost,
                 network_hardware_cost,
                 installation_cost,
                 one_time_cost,
                 platform_cost_hour,
                 monthly_cost_instance,
                 cost_extra_gb_memory,
                 cost_extra_gb_storage,
                 number_years,
                 number_cores,
                 memory_size,
                 storage_size,
                 storage_type,
                 usage_hours):
        """Store the cost-model parameters; no file is read here.

        Every argument is kept on the instance under the same name, except
        ``nber_servers`` which is stored as ``number_servers``.
        """
        self.number_servers = nber_servers
        # Directories (relative to the working directory) scanned for workbooks.
        self.basedirs = ["Cat1", "Cat2"]
        # Filled by merge_datasets(), then filtered/extended by process_data().
        self.dfservers = pd.DataFrame()
        self.maintainanceCost = maintainanceCost
        self.maintainance_hours = maintainance_hours
        self.software_init_cost = software_init_cost
        self.software_update_cost = software_update_cost
        self.network_hardware_cost = network_hardware_cost
        self.installation_cost = installation_cost
        self.one_time_cost = one_time_cost
        self.platform_cost_hour = platform_cost_hour
        self.monthly_cost_instance = monthly_cost_instance
        # Cost per extra gigabyte of memory / storage.
        self.cost_extra_gb_memory = cost_extra_gb_memory
        self.cost_extra_gb_storage = cost_extra_gb_storage
        self.number_years = number_years
        self.number_cores = number_cores
        self.memory_size = memory_size
        self.storage_size = storage_size
        self.storage_type = storage_type
        self.usage_hours = usage_hours

    def duplicate(self, x, n):
        """Return a list holding ``n`` copies of ``x[0]``."""
        return [x[0]] * n

    def getItem(self, item):
        """Map an extension sheet name to its unit-cost column name.

        Returns ``None`` for a sheet name that is not known.
        """
        return self._SHEET_TO_COST_COLUMN.get(item)

    def difference_percentage(self, x):
        """Return ``x[0] / x[1]`` as a percentage rounded to two decimals.

        ``x`` may be any two-element sequence (list, tuple, pandas row).
        ``np.nan`` is returned when a value is missing or the denominator
        is zero.
        """
        # Work on a plain list: `in` / `[0]` on a pandas Series look at index
        # labels rather than at the values.
        values = list(x)
        if any(pd.isnull(v) for v in values):
            return np.nan
        numerator, denominator = values[0], values[1]
        if denominator == 0:
            return np.nan
        return round((float(numerator) / float(denominator)) * 100, 2)

    @staticmethod
    def _stack(frames):
        """Concatenate frames row-wise; tolerate an empty list.

        Placeholder frames with neither rows nor columns are skipped.
        """
        usable = [f for f in frames if len(f.columns) or len(f.index)]
        if not usable:
            return pd.DataFrame()
        return pd.concat(usable)

    def _broadcast_first_row(self, frame, extension):
        """Overwrite, in place, the columns of ``frame`` that do not come from
        ``extension`` (and are not "id") with their first-row value."""
        extension_columns = list(extension.columns)
        own_columns = [c for c in frame.columns
                       if c not in extension_columns and c != "id"]
        for column in own_columns:
            frame[column] = self.duplicate(frame[column], frame.shape[0])
        return frame

    def _expand_rows(self, frame, extension):
        """Return one expanded table per row of ``frame``.

        Each row is outer-merged with ``extension`` and the row's own values
        are then copied onto every line of the merge result.
        """
        pieces = []
        for i in range(frame.shape[0]):
            # Transpose of the row Series = one-row data frame.
            row = pd.DataFrame(frame.iloc[i]).T
            merged = row.merge(extension, how="outer")
            pieces.append(self._broadcast_first_row(merged, extension))
        return pieces

    def createServerSet(self, baseConfDf, extensionfile):
        """Expand a base configuration with the options of an extension file.

        baseConfDf    -- base configuration data frame; a processor unit-cost
                         column is added to it in place.
        extensionfile -- path of the Excel workbook holding the extension
                         sheets ("procs" is required; "mem", "network",
                         "storage3inch" and "storage2inch" are used when
                         present).

        Returns the expanded data frame, which always carries the four
        unit-cost columns.
        """
        workbook = pd.ExcelFile(extensionfile)
        dframes = dict((name, workbook.parse(name))
                       for name in workbook.sheet_names)

        # Add the processor unit cost, then combine with the processor sheet.
        baseConfDf[self.getItem("procs")] = 0
        data = baseConfDf.merge(dframes["procs"], how="outer")
        self._broadcast_first_row(data, dframes["procs"])

        ext = [v for v in dframes.keys()
               if v in ["mem", "network", "storage3inch", "storage2inch"]]
        # When both drive formats exist they are expanded separately, after
        # the other extensions, each one starting from the same table.
        both_storage_kinds = ("storage3inch" in ext) and ("storage2inch" in ext)
        if both_storage_kinds:
            ext.remove("storage3inch")
            ext.remove("storage2inch")

        for elt in ext:
            data[self.getItem(elt)] = 0
            # Rows are expanded from the table as it stands before the
            # new lines are added.
            data = pd.concat([data] + self._expand_rows(data, dframes[elt]))

        for item in self._UNIT_COST_COLUMNS:
            if item not in list(data):
                data[item] = 0

        if both_storage_kinds:
            base = data
            for storage in ["storage3inch", "storage2inch"]:
                variant = base.copy()
                variant[self.getItem(storage)] = 0
                data = pd.concat(
                    [data, variant]
                    + self._expand_rows(variant, dframes[storage]))

        return data

    def merge_datasets(self):
        """Load every category directory into ``self.dfservers``.

        In each directory of ``self.basedirs`` the files whose name contains
        "category" are base configurations and those containing "extensions"
        are option lists; names containing "$" are ignored.  Every sheet of a
        base workbook is expanded with the first extensions file whose name
        contains the sheet name.  A "category" column (directory name) and a
        "Cost" column (sum of all columns whose name contains "Cost") are
        added, and the table is written to "dataset.xlsx".

        Raises IOError when no extensions file matches a sheet.
        """
        root = os.getcwd()
        for directory in self.basedirs:
            path = os.path.join(root, directory)
            entries = [v for v in os.listdir(path) if "$" not in v]
            baseconfigs = [v for v in entries if "category" in v]
            extensions = [v for v in entries if "extensions" in v]

            expanded = []
            for base in baseconfigs:
                xl_file = pd.ExcelFile(os.path.join(path, base))
                for sheet_name in xl_file.sheet_names:
                    # NOTE(review): the original passed `na_value="na"`, which
                    # is not the documented keyword (`na_values`); it is
                    # dropped here and literal "na" cells are handled when
                    # the costs are summed below -- confirm the intent.
                    sheet = xl_file.parse(sheet_name)
                    matches = [v for v in extensions if sheet_name in v]
                    if not matches:
                        raise IOError(
                            "no extensions file found for sheet %r in %s"
                            % (sheet_name, path))
                    expanded.append(self.createServerSet(
                        sheet, os.path.join(path, matches[0])))

            category_frame = self._stack(expanded)
            category_frame["category"] = directory
            self.dfservers = self._stack([self.dfservers, category_frame])

        cost_fields = [v for v in self.dfservers.columns if "Cost" in v]
        self.dfservers["Cost"] = 0
        for field in cost_fields:
            self.dfservers["Cost"] += [
                float(y) if y != "na" else np.nan
                for y in self.dfservers[field]]

        self.dfservers.to_excel("dataset.xlsx", sheet_name='cat',
                                engine='xlsxwriter', index=False)

    def filter_dataframe(self, items):
        """Keep only the rows of ``self.dfservers`` matching one requirement.

        ``items`` names the column to test: "numberCores", "memory" and
        "storageSize" must be at least the configured value, "storageType"
        must equal it.  Any other name leaves the table untouched.  The
        boolean "subset" column is (re)written as a side effect.
        """
        if items == "storageType":
            keep = [str(v) == self.storage_type
                    for v in self.dfservers[items]]
        elif items in self._MINIMUM_FILTERS:
            minimum = getattr(self, self._MINIMUM_FILTERS[items])
            keep = [int(v) >= minimum for v in self.dfservers[items]]
        else:
            return

        self.dfservers["subset"] = keep
        # Copy so that later column assignments do not target a slice.
        self.dfservers = self.dfservers[np.array(keep, dtype=bool)].copy()

    def _add_cost_columns(self):
        """Add the extra-capacity, traditional and on-demand cost columns."""
        servers = self.dfservers

        servers["extra_memory_cost"] = (
            servers["memory"] - self.memory_size) * self.cost_extra_gb_memory
        servers["extra_storage_cost"] = (
            servers["storageSize"] - self.storage_size) * self.cost_extra_gb_storage

        # Maintenance: monthly hours * 12 * hourly rate, for every year.
        maintainance_hours_year = self.maintainance_hours * 12
        maintenance_total = (self.number_years * maintainance_hours_year
                             * self.maintainanceCost)
        # Software is assumed to be updated every year (licences included).
        software_total = (self.software_init_cost
                          + self.number_years * self.software_update_cost)

        # Cost of a home made data center.
        servers["traditional_hpc_cost"] = (
            servers["Cost"] * self.number_servers
            + maintenance_total
            + software_total
            + self.network_hardware_cost
            + self.installation_cost)

        # Cost of the corresponding rented instances (the outsourced cost).
        rented_per_server = (
            self.one_time_cost
            + self.platform_cost_hour * self.usage_hours * 365 * self.number_years)
        servers["ondemand_hpc_cost"] = (
            self.number_servers * rented_per_server
            + maintenance_total
            + software_total)
        servers["ondemand_hpc_cost"] = (
            servers["ondemand_hpc_cost"]
            + self.number_servers * (servers["extra_memory_cost"]
                                     + servers["extra_storage_cost"]))

        difference = servers["ondemand_hpc_cost"] - servers["traditional_hpc_cost"]
        servers["Cost difference (%)"] = [
            self.difference_percentage(pair)
            for pair in zip(difference, servers["ondemand_hpc_cost"])]
        servers["Cost difference (HPC ondemand - traditional)"] = difference

    def process_data(self):
        """Filter the server table, add the cost comparison and export it.

        The number of maintenance hours is kept constant over the years and
        the software costs are the ones given to the constructor.  Only the
        servers meeting the configured number of cores, storage type,
        storage size and memory size are kept.  The result is written to
        "dataset_analysis.xlsx".

        Raises ValueError when ``number_years`` is below 3.
        """
        if self.number_years < 3:
            raise ValueError("The number of years should be at least 3!")

        for column in ["numberCores", "storageType", "storageSize", "memory"]:
            self.filter_dataframe(column)

        if "subset" in self.dfservers.columns:
            del self.dfservers["subset"]

        self._add_cost_columns()

        # Update the server data set with the new columns.
        self.dfservers.to_excel("dataset_analysis.xlsx", sheet_name='cat',
                                engine='xlsxwriter', index=False)
|
364 |
|
|
365 |
|
|
366 |
|
|
367 |
|
|
368 |
def main(argv):
    """Main entrypoint.

    Parses the command line in ``argv`` (``argv[0]`` is the program name),
    builds the data set and runs the cost analysis.

    Returns 0 on success and 1 when any exception was raised on the way;
    the exception is printed.
    """
    # The defaults below come from Amazon prices (Frankfurt region).
    # Baseline noted by the author: c3.2xlarge, 8 cores, 2 x 80 SSD,
    # $0.420 per hour; upfront $3288, monthly $132.86, hourly $0.3071.

    # (option strings, dest, metavar, type, default, help)
    option_table = [
        (("-n", "--servers"), "servers", "NUMBER_SERVRES", "int", 1000,
         "number of servers for each observation"),
        (("--hour_maintainance",), "maintainance_hours", "MAINTAINANCE_HOURS",
         "int", 15, "number of hours of maintainance per month"),
        # "--maintainanceCost" is accepted as an alias of the original,
        # misspelled, option name.
        (("--maintananceCost", "--maintainanceCost"), "maintainance_cost",
         "MAINTAINANCE_COST", "float", 50, "cost per maintainance hour"),
        (("--software_init_cost",), "software_cost", "SOFTWARE_INITIAL_COST",
         "float", 5000,
         "initial software cost (this include network software)"),
        (("--software_update_cost",), "software_update", "SOFTWARE_UPDATE",
         "float", 10000,
         "cost associate to software update including licences"),
        (("--network_hardware_cost",), "network_hardware", "NETWORK_HARDWARE",
         "float", 15000,
         "initial cost associated to network equipments purchase"),
        (("--installation_cost",), "install_cost", "INTALLATION_COST",
         "float", 10000, "initial cost associated to the installation"),
        # One time cost for a period of 3 years.  Declared as float: the
        # default itself is fractional.
        (("--one_time_cost",), "one_time_cost", "ONE_TIME_COST",
         "float", 2690.14,
         "one time set up cost when you book instances from amazone"),
        (("--platform_cost_hour",), "cost_hour", "PLATFORM_COST_HOUR",
         "float", 0.543,
         "hourly cost of a single instance booked from amazone"),
        (("--monthly_cost_instance",), "cost_monthly", "MONTHLY_COST",
         "float", 108.64,
         "monthly cost of a single machine book instances from amazone"),
        (("--cost_extra_gb_memory",), "extra_memory", "EXTRA_MEMORY",
         "float", 1.23, "cost per hour per giga byte of extra memory"),
        (("--cost_extra_gb_storage",), "extra_storage", "EXTRA_STORAGE",
         "float", 0.82, "cost per hour per giga byte of extra storage"),
        (("--number_years",), "number_years", "NUMBER_YEARS", "int", 3,
         "Number of years over which you want to predict"),
        (("--number_cores",), "number_cores", "NUMBER_CPU_CORES", "int", 8,
         "Number of CPU cores per host"),
        (("--memory_size",), "memory_size", "MEMORY_SIZE", "int", 48,
         "Memory size expected, max is 768"),
        (("--storage_size",), "storage_size", "STORAGE_SIZE", "int", 48,
         "Storage size expected"),
        (("--storage_type",), "storage_type", "STORAGE_TYPE", "string",
         "ssd sata", "Storage type expected"),
        # "--usage_hours" is accepted as an alias of the original option name.
        (("--usage_ours", "--usage_hours"), "usage_hours", "USAGE_HOURS",
         "int", 24, "number of hours the system is used per day"),
    ]

    try:
        # Parse command-line options
        parser = OptionParser()
        for flags, dest, metavar, kind, default, help_text in option_table:
            parser.add_option(*flags, dest=dest, metavar=metavar,
                              action="store", type=kind, default=default,
                              help=help_text)

        options, args = parser.parse_args(argv[1:])

        # Keyword arguments keep every value attached to the parameter it
        # is meant for (hours and hourly cost are easy to swap positionally).
        gen = GenerateDate(
            nber_servers=options.servers,
            maintainanceCost=options.maintainance_cost,
            maintainance_hours=options.maintainance_hours,
            software_init_cost=options.software_cost,
            software_update_cost=options.software_update,
            network_hardware_cost=options.network_hardware,
            installation_cost=options.install_cost,
            one_time_cost=options.one_time_cost,
            platform_cost_hour=options.cost_hour,
            monthly_cost_instance=options.cost_monthly,
            cost_extra_gb_memory=options.extra_memory,
            cost_extra_gb_storage=options.extra_storage,
            number_years=options.number_years,
            number_cores=options.number_cores,
            memory_size=options.memory_size,
            storage_size=options.storage_size,
            storage_type=options.storage_type,
            usage_hours=options.usage_hours)

        gen.merge_datasets()
        gen.process_data()

    except Exception as e:
        print("Exception %s raised" % e)
        return 1

    return 0
|
484 |
|
|
485 |
if __name__ == "__main__":
    # Run the command-line entry point and hand its return value to the
    # interpreter as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)
b/livrables/L4_2_4/Data_file/GenerateServers.py~ | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
|
|
3 |
from __future__ import division |
|
4 |
import numpy as np |
|
5 |
import pandas as pd |
|
6 |
from numpy import genfromtxt |
|
7 |
from pandas import DataFrame |
|
8 |
|
|
9 |
|
|
10 |
import re |
|
11 |
import math |
|
12 |
import sys |
|
13 |
import os |
|
14 |
from optparse import OptionParser |
|
15 |
|
|
16 |
import logging |
|
17 |
|
|
18 |
|
|
19 |
''' |
|
20 |
python version: |
|
21 |
2.7.8 |Anaconda 2.0.1 (x86_64) |
|
22 |
|
|
23 |
full description: |
|
24 |
2.7.8 |Anaconda 2.0.1 (x86_64)| (default, Aug 21 2014, 15:21:46) \n[GCC 4.2.1 (Apple Inc. build 5577)] |
|
25 |
''' |
|
26 |
|
|
27 |
|
|
28 |
np.random.seed(12345) |
|
29 |
|
|
30 |
|
|
31 |
class GenerateDate(): |
|
32 |
|
|
33 |
def __init__(self, nber_servers, |
|
34 |
maintainanceCost, |
|
35 |
maintainance_hours, |
|
36 |
software_init_cost, |
|
37 |
software_update_cost, |
|
38 |
network_hardware_cost, |
|
39 |
installation_cost, |
|
40 |
one_time_cost, |
|
41 |
platform_cost_hour, |
|
42 |
monthly_cost_instance, |
|
43 |
cost_extra_gb_memory, |
|
44 |
cost_extra_gb_storage, |
|
45 |
number_years, |
|
46 |
number_cores, |
|
47 |
memory_size, |
|
48 |
storage_size, |
|
49 |
storage_type, |
|
50 |
usage_hours): |
|
51 |
|
|
52 |
self.number_servers = nber_servers |
|
53 |
self.basedirs = ["Cat1", "Cat2"] |
|
54 |
self.dfservers = pd.DataFrame() |
|
55 |
self.maintainanceCost = maintainanceCost |
|
56 |
self.maintainance_hours = maintainance_hours |
|
57 |
self.software_init_cost = software_init_cost |
|
58 |
self.software_update_cost = software_update_cost |
|
59 |
self.network_hardware_cost= network_hardware_cost |
|
60 |
self.installation_cost = installation_cost |
|
61 |
self.one_time_cost = one_time_cost |
|
62 |
self.platform_cost_hour = platform_cost_hour |
|
63 |
self.monthly_cost_instance = monthly_cost_instance |
|
64 |
self.cost_extra_gb_memory = cost_extra_gb_memory #cost per extra giga bite of memory |
|
65 |
self.cost_extra_gb_storage = cost_extra_gb_storage |
|
66 |
self.number_years = number_years |
|
67 |
self.number_cores = number_cores |
|
68 |
self.memory_size = memory_size |
|
69 |
self.storage_size = storage_size |
|
70 |
self.storage_type = storage_type |
|
71 |
self.usage_hours = usage_hours |
|
72 |
|
|
73 |
def duplicate(self, x, n): |
|
74 |
return [x[0]] *n |
|
75 |
|
|
76 |
def getItem(self, item): |
|
77 |
if item == "procs": |
|
78 |
return "procUnitCost" |
|
79 |
else: |
|
80 |
if item == "mem": |
|
81 |
return "MemUnitCost" |
|
82 |
else: |
|
83 |
if item == "storage3inch" or item == "storage2inch": |
|
84 |
return "StorageUnitCost" |
|
85 |
else: |
|
86 |
if item == "network": |
|
87 |
return "netUnitCost" |
|
88 |
|
|
89 |
def difference_percentage(self, x): |
|
90 |
if np.nan in x: |
|
91 |
return np.nan |
|
92 |
else: |
|
93 |
return round((x[0] / x[1]) * 100, 2) |
|
94 |
|
|
95 |
|
|
96 |
''' |
|
97 |
base configuration data frame |
|
98 |
file name for the corresponding extension file |
|
99 |
flag to identify the appropriate component |
|
100 |
''' |
|
101 |
|
|
102 |
def createServerSet(self, baseConfDf, extensionfile): |
|
103 |
xcl_file = pd.ExcelFile(extensionfile) |
|
104 |
|
|
105 |
dframes = {sheet_name: xcl_file.parse(sheet_name) for sheet_name in xcl_file.sheet_names} |
|
106 |
# |
|
107 |
keys = dframes.keys() |
|
108 |
|
|
109 |
## add the processor unit cost |
|
110 |
baseConfDf[self.getItem("procs")] = 0 |
|
111 |
# |
|
112 |
|
|
113 |
data = baseConfDf.merge(dframes["procs"], how="outer", copy=False) |
|
114 |
colnames = list(data.columns) |
|
115 |
proc_names = list(dframes["procs"]) |
|
116 |
colnames_t = [x for x in colnames if x not in proc_names] |
|
117 |
colnames = [x for x in colnames_t if x not in ["id"]] |
|
118 |
|
|
119 |
data[colnames] = data[colnames].apply(self.duplicate, 0, args=[data.shape[0]]) |
|
120 |
|
|
121 |
ext = [v for v in keys if v in ["mem", "network","storage3inch", "storage2inch"]] |
|
122 |
print ext |
|
123 |
if ("storage3inch" in ext) and ("storage2inch" in ext): |
|
124 |
ext.remove("storage3inch") |
Formats disponibles : Unified diff