Script in R below:
library(stringr)
#Step 1 ----------
rm(list=ls())
dd <- read.csv("C:\\Users\\estoyanova\\OneDrive - VMware, Inc\\ES\\UNI\\master BA\\Boriana-Monthly challenge\\Air Tube\\data_bg_2017.csv", header = TRUE, sep = ",", na.strings = c(""," ", "NA", "#NA"), stringsAsFactors = FALSE)
topo <- read.csv("C:\\Users\\estoyanova\\OneDrive - VMware, Inc\\ES\\UNI\\master BA\\Boriana-Monthly challenge\\TOPO-DATA\\sofia_topo.csv", header = TRUE, sep = ",", na.strings = c(""," ", "NA", "#NA"), stringsAsFactors = FALSE)
dd2 <- read.csv("C:\\Users\\estoyanova\\OneDrive - VMware, Inc\\ES\\UNI\\master BA\\Boriana-Monthly challenge\\Air Tube\\data_bg_2018.csv", header = TRUE, sep = ",", na.strings = c(""," ", "NA", "#NA"), stringsAsFactors = FALSE)
# Check for missing values
sum(is.na(dd))
sum(is.na(dd2)) #[1] 4
sum(is.na(topo))
#removing incorrect observations
dd2017 <- subset(dd,dd$humidity!=0) #min humidity = 1
min(dd2017$humidity) #[1] 1
dd2017c <- subset(dd2017,dd2017$pressure!=0)
min(dd2017c$pressure) #[1] 69774
#removing observations where the humidity is below 10%
dd2017cl <- subset(dd2017c,dd2017c$humidity>10)
min(dd2017cl$humidity)#[1] 11
#removing observations where the pressure is below 90000.
dd2017clean <- subset(dd2017cl,dd2017cl$pressure>90000)#min temperature -148, max 56
min (dd2017clean$pressure) #[1] 90001
#removing the observations with extreme temperatures
dd2017clean1 <- subset(dd2017clean,dd2017clean$temperature>-39) #the lowest temperature measured in Bulgaria is about -38.5
dd2017clean2 <- subset(dd2017clean1,dd2017clean1$temperature<46) #the highest temperature measured in Bulgaria is 45.2
rm(dd)
rm(dd2017)
rm(dd2017c)
rm(dd2017cl)
rm(dd2017clean)
rm(dd2017clean1)
#removing the same observations from 2018 data
dd2018 <- subset(dd2,dd2$humidity>10)
dd2018c <- subset(dd2018,dd2018$pressure>90000)
dd2018cl <- subset(dd2018c,dd2018c$temperature>-39)
dd2018clean <- subset(dd2018cl,dd2018cl$temperature<46)
rm(dd2)
rm(dd2018)
rm(dd2018c)
rm(dd2018cl)
#Renaming the cleaned datasets to convenient names
dd2017 <- dd2017clean2
dd2018 <- dd2018clean
rm(dd2017clean2)
rm(dd2018clean)
#Step 2 ----------
library(lubridate)
dd2017$time <- ymd_hms(dd2017$time)
dd2018$time <- ymd_hms(dd2018$time)
#Step 3 ----------
#identifying the unique geohashes in both datasets
a <- unique(dd2017$geohash)
b <- unique(dd2018$geohash)
#identifying those geohashes from the 2017 data that are missing in the 2018 data
c <- setdiff(a,b) #12 geohashes present in 2017 but missing in 2018
d <- setdiff(b,a) #731 geohashes present in 2018 but missing in 2017
rm(d)
library(dplyr)
#removing those geohashes from the 2017 data that are missing in the 2018 data
dd2017end <- subset(dd2017,!(dd2017$geohash %in% c))
#aggregating the 2017 and 2018 data into a single set
data <- bind_rows(dd2017end,dd2018)
#Step 4 ----------
f <- subset(data,is.na(data$geohash)) #0 obs.
#making a new data frame, grouped by geohash, with the measurement duration in days
dataG <- data %>%
  group_by(geohash) %>%
  summarise(tmin=min(time), tmax=max(time), obs=n(), days=tmax-tmin)
quantile(dataG$days,probs=seq(0,1,0.05)) #around the 10th percentile the duration is 35 days
#removing the stations with 35 days of measurements or fewer
dataGl <- subset(dataG,dataG$days>35)
d <- unique(dataGl$geohash)
e <- unique(data$geohash)
f <- setdiff(e,d) #109 geohashes to drop
dataEnd <- subset(data,!(data$geohash %in% f))
#Task 5 ----------
library(geohash)
#Convert geohashes into latitude and longitude
#bind columns from gh_decode dataframe into our main table
dataEnd <- cbind(dataEnd,gh_decode(dataEnd$geohash))
View(dataEnd)
#use a map to plot based on lat & lng
#library(ggmap)
library(ggplot2)
library(rworldmap)
newmap <- getMap(resolution = "low")
#Sofia, Bulgaria coordinates: latitude 42.698334, longitude 23.319941
plot(newmap, xlim = c(20, 29), ylim = c(40, 45), asp = 1)
#visualize all those points
points(dataEnd$lng, dataEnd$lat, col = "red", cex = .6)
#limit to only the Sofia region (latitude 42.698334, longitude 23.319941); a sketch of one possible approach follows below
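One possible way to restrict the map to the Sofia region is sketched below. The half-degree bounding box around the Sofia centre coordinates and the sofiaData name are assumptions for illustration only, not values from the original analysis; the column names lat and lng are the ones produced by gh_decode and already used above.

#sketch: limit the points to a Sofia bounding box (the +/- 0.5 degree window is an assumed value)
sofia_lat <- 42.698334
sofia_lng <- 23.319941
sofiaData <- subset(dataEnd, lat > sofia_lat - 0.5 & lat < sofia_lat + 0.5 & lng > sofia_lng - 0.5 & lng < sofia_lng + 0.5)
#redraw the map zoomed to the Sofia window and overlay only the Sofia stations
plot(newmap, xlim = c(sofia_lng - 0.5, sofia_lng + 0.5), ylim = c(sofia_lat - 0.5, sofia_lat + 0.5), asp = 1)
points(sofiaData$lng, sofiaData$lat, col = "red", cex = .6)

Filtering the data first (rather than only zooming the axes) also leaves a reusable Sofia-only subset for later steps.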
7 thoughts on "Air Sofia Pollution Case"
You overestimate the powers of our mentors; please write some human-readable explanations, add some visualisations, etc. We would like to understand your ideas.
Thanks, if we knew how to do it in the article we would do it. Please have patience with us, we are still quite new to these types of challenges and this is not an easy task. Any guidance on the types of data visualisations you would like to see?
Your assignments to peer review (and give feedback below the corresponding articles) for week 2 of the Monthly challenge are the following teams:
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-kiwi-team/
https://www.datasciencesociety.net/sofia-air-quality-eda-exploratory-data-analysis/
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-kung-fu-panda/
Hi @ESTOYANOVA, @MATANSKI, @SKADIR, @POLINA – I just visited your page for week 2 updates as that was part of my review assignment. Let me know when you are ready for review by just tagging me in a comment or chatting me.
For what it's worth, if you want a free, drag-and-drop way to visualize your data, you can try https://public.tableau.com/en-us/s/. The public version of the tool is free and fully functional for up to 15 million rows of data.
Visiting again on Nov 3rd to check in for review. Keep working through it; I look forward to your update.
Your assignments to peer review (and give feedback below the corresponding articles) for week 3 of the Monthly challenge are the following teams:
https://www.datasciencesociety.net/the-pumpkins/
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-banana/
https://www.datasciencesociety.net/sofia-air-week-1/
Your assignments to peer review (and give feedback below the corresponding articles) for week 4 of the Monthly challenge are the following teams:
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-dirty-minds/
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-iseveryonehigh/
https://www.datasciencesociety.net/monthly-challenge-sofia-air-solution-newbees/