
Python Code: Pressure Changes


The code below was used to analyze barometric pressure changes in 15 cities in the United States. It is a good example of basic Python programming applied to data cleaning and exploratory data analysis.

The example covers the following topics: (a) concatenating CSV files, (b) dropping columns, (c) dropping rows based on a condition, (d) replacing values in a column, (e) datetime operations, (f) writing basic functions, (g) grouping observations by a value and basic statistical functions, and (h) visualizations such as marking coordinates on a map and ggplot-style graphs.

#Import packages and set the working directory
import os
import glob
import pandas as pd
os.chdir("/Users/gellerttoth/Documents/Datasklr/Weather/Analysis Space")
# Match the csv file pattern
# and save the list of all matching file names in the 'all_filenames' variable. 
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

#Concatenate all files on the list 
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
#Concatenate again and create a dataframe called all_cities
all_cities = pd.concat([pd.read_csv(f) for f in all_filenames ])
#Check the dataframe
all_cities.head(5)  #look at first 5 observations
len(all_cities)     #find the number of observations in the data
all_cities.dtypes   #look at the variable types in the data
all_cities.shape    #check how many observations and variables are in the data
type(all_cities)    #check the type of the object: see if it is a pandas DataFrame
all_cities.columns #list the column names

print(all_cities.head(5),len(all_cities), all_cities.dtypes, all_cities.shape, type(all_cities), all_cities.columns)
#Drop unwanted columns/Select necessary columns
#Save the result in a new dataframe called analytical_dataset

analytical_dataset=all_cities[['USAF', 'WBAN', 'YR--MODAHRMN','SLP', 'ALT', 'STP']].copy()  #.copy() avoids SettingWithCopyWarning on later in-place edits
analytical_dataset.head()

#Clean up data
#Change the name of the column YR--MODAHRMN to DATE_HOUR

analytical_dataset.rename(columns={'YR--MODAHRMN':'DATE_HOUR'}, inplace=True)
#Examination of the data shows that most weather stations report in a predetermined sequence but also 
#produce out-of-sequence reports.  These out-of-sequence reports do not have SLP measurements and are marked with stars.
# Note six stars for SLP and five stars for ALT

#Clean up the SLP and ALT columns - drop rows by condition 
#The procedure drops the out-of-sequence reports that carry no SLP or ALT measurement. 

analytical_dataset=analytical_dataset[analytical_dataset.SLP != '******']
analytical_dataset=analytical_dataset[analytical_dataset.ALT != '*****']
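#Optional sanity check (a sketch; it assumes the starred placeholders were the only non-numeric markers in these two columns)
print((analytical_dataset.SLP == '******').sum(), (analytical_dataset.ALT == '*****').sum())  #both counts should be 0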
#A list of station codes is included in the dataframe
#Look up the city name for each USAF ID on the internet
analytical_dataset.USAF.unique()
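#Optionally, check how many reports each station contributed (illustrative check)
analytical_dataset['USAF'].value_counts()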

#Recode each USAF ID with the appropriate city name to make the data more meaningful 

city_names = {725020: 'Newark', 722020: 'Miami', 722780: 'Phoenix', 724940: 'San Francisco',
              722900: 'San Diego', 725650: 'Denver', 726070: 'Bangor', 722590: 'Dallas',
              911820: 'Honolulu', 725300: 'Chicago', 727930: 'Seattle', 724846: 'Las Vegas',
              723060: 'Raleigh', 725510: 'Lincoln', 723405: 'Little Rock', 722316: 'New Orleans'}
analytical_dataset['USAF'] = analytical_dataset['USAF'].replace(city_names)
#MAKE THE DATE_HOUR COLUMN USEFUL FOR ANALYSIS

from datetime import datetime

#FIRST CHANGE DATE_HOUR TO STRING
analytical_dataset['DATE_HOUR']=analytical_dataset.DATE_HOUR.astype(int).astype(str)

#The date_time data was a bit unusual since it contained year, month, day, hour and minute in one string 
#The function creates datetime from the 12 digit string. 
#The first four digits are year, then month, day, hour and minute

def date_convert(input_str):
    from datetime import datetime
    output = datetime(year = int(input_str[0:4]),
                      month = int(input_str[4:6]),
                      day = int(input_str[6:8]),
                      hour = int(input_str[8:10]),
                      minute = int(input_str[10:12]))
    return output

#deploy the function on the data and create a NEWDATE column which contains the date in datetime format
analytical_dataset['NEWDATE'] = analytical_dataset.DATE_HOUR.astype(int).astype(str).apply(date_convert)
analytical_dataset.head()
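#As an aside, pandas can parse the 12-digit YYYYMMDDHHMM string directly with one vectorized call.
#A minimal sketch (NEWDATE_ALT is an illustrative column name; assumes every DATE_HOUR value is a full 12-digit string):
analytical_dataset['NEWDATE_ALT'] = pd.to_datetime(analytical_dataset['DATE_HOUR'], format='%Y%m%d%H%M')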
#object SLP needs to be converted to float for calculation of standard deviation and mean

analytical_dataset['SLP']=analytical_dataset.SLP.astype(float) 

#Calculate the standard deviation and mean of each city's SLP
#Concatenate the standard deviation and mean columns
#Sort by standard deviation

result1=analytical_dataset.groupby('USAF').std()
result1.rename(columns={'SLP':'Standard Deviation'}, inplace=True)

result2=analytical_dataset.groupby('USAF').mean()
result2.rename(columns={'SLP':'Mean'}, inplace=True)

result3 = pd.concat([result1['Standard Deviation'], result2['Mean']], axis=1, sort=False)
result3=result3.sort_values(by=['Standard Deviation'])
result3
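#The same table can also be built in a single chained call; a sketch (result3_alt is an illustrative name):
result3_alt = (analytical_dataset.groupby('USAF')['SLP']
               .agg(['std', 'mean'])
               .rename(columns={'std': 'Standard Deviation', 'mean': 'Mean'})
               .sort_values(by='Standard Deviation'))
result3_alt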

#Export dataframe to csv so that we can create an html table for web

#result3.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
result3.to_csv("/Users/gellerttoth/Documents/Datasklr/Weather/Analysis Space/cities.csv")

#Transform the NEWDATE column to datetime format
analytical_dataset["NEWDATE"]= pd.to_datetime(analytical_dataset["NEWDATE"]) 
analytical_dataset.info()

#create year and month columns from datetime column (NEWDATE)
#this preparation will be used later for crosstabs
analytical_dataset['year'] = pd.DatetimeIndex(analytical_dataset['NEWDATE']).year
analytical_dataset['month'] = pd.DatetimeIndex(analytical_dataset['NEWDATE']).month
analytical_dataset.head()
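#The new year and month columns make the crosstabs mentioned above possible; a minimal sketch counting reports per year and month:
pd.crosstab(analytical_dataset['year'], analytical_dataset['month'])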

#categorize months and days as one of four seasons
#the first step is to create a month/day (MMDD) column by slicing characters 5 through 8 of the 12-digit DATE_HOUR string
#DATE_HOUR needs to be converted to a string first 

analytical_dataset['monthday'] = analytical_dataset['DATE_HOUR'].astype(str).str.slice(4,8)  #use string for slice
analytical_dataset['monthday'] = analytical_dataset['monthday'].astype(int) #must use integer for if elif else
analytical_dataset.head()
#spring: march 20 - june 21
#summer: june 22 - september 23
#fall: september 24 - december 21
#winter: december 22 - march 19

#write a function that assigns a season to each month/day value

def which_season(x):
    if 320 <= x <= 621:
        return 'spring'
    elif 622 <= x <= 923:
        return 'summer'
    elif 924 <= x <= 1221:
        return 'fall'
    else:
        return 'winter'
    
#deploy function
analytical_dataset['seasons'] =  analytical_dataset['monthday'].apply(which_season)
analytical_dataset.head()
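#A vectorized alternative to apply() using numpy (a sketch; seasons_alt is an illustrative column name):
import numpy as np
conditions = [analytical_dataset['monthday'].between(320, 621),
              analytical_dataset['monthday'].between(622, 923),
              analytical_dataset['monthday'].between(924, 1221)]
analytical_dataset['seasons_alt'] = np.select(conditions, ['spring', 'summer', 'fall'], default='winter')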

#Analyze by season
#Calculate standard deviation and mean by season

result10=pd.DataFrame((analytical_dataset.groupby('seasons').std()))
result10.rename(columns={'SLP':'Standard Deviation'}, inplace=True)

result11=pd.DataFrame((analytical_dataset.groupby('seasons').mean()))
result11.rename(columns={'SLP':'Mean'}, inplace=True)

result12= pd.concat([result10['Standard Deviation'], result11['Mean']], axis=1, sort=False)
result12

#Analyze by year
#Calculate standard deviation and mean by year

result100=pd.DataFrame((analytical_dataset.groupby('year').std()))
result100.rename(columns={'SLP':'Standard Deviation'}, inplace=True)

result101=pd.DataFrame((analytical_dataset.groupby('year').mean()))
result101.rename(columns={'SLP':'Mean'}, inplace=True)

result102= pd.concat([result100['Standard Deviation'], result101['Mean']], axis=1, sort=False)
result102

#Calculate Standard Deviation by City and season
grouped1=analytical_dataset.groupby(['seasons','USAF'])['SLP'].std()
pd.DataFrame(grouped1)
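#For easier comparison, the grouped result can be pivoted into a city-by-season table (sketch):
grouped1.unstack(level='seasons')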
#create visualizations
import numpy as np
from plotnine import *
  
#keep only the columns needed for plotting
data = analytical_dataset.filter(['USAF', 'SLP', 'year', 'seasons'])
fig = (
    ggplot(data) + 
    geom_boxplot(
        aes(x = 'USAF',
            y = 'SLP'),
        color='black'
    )+
     labs(
        title ='Variability in Atmospheric Pressure - Select US Cities',
        x = 'City',
        y = 'Sea level pressure (millibars)'
    ) + theme(axis_text_x  = element_text(angle = 90, hjust = 1))
)        
fig  

fig = (
    ggplot(data) + 
    geom_boxplot(
        aes(x = 'seasons',
            y = 'SLP'),
        color='black'
    )+
     labs(
        title ='Variability in Atmospheric Pressure by Season',
        x = 'Season',
        y = 'Sea level pressure in millibars (aggregate of selected cities)'
    ) 
)         
fig  
# import folium for mapping (pandas is already imported above)
import folium
# Make a data frame with dots to show on the map
# Note: western-hemisphere longitudes are negative
data = pd.DataFrame({
    'lat':[44.8016, 41.8781, 32.7767, 39.7392, 21.3069, 36.1699, 40.8136, 34.7465, 25.7617, 29.9511, 40.7357, 33.4484, 35.7796, 32.7157, 37.7749, 47.6062],  #SN
    'lon':[-68.7712, -87.6298, -96.7970, -104.9903, -157.8583, -115.1398, -96.7026, -92.2896, -80.1918, -90.0715, -74.1724, -112.0740, -78.6382, -117.1611, -122.4194, -122.3321],  #EW
    'name':['Bangor', 'Chicago', 'Dallas', 'Denver', 'Honolulu', 'Las Vegas', 'Lincoln', 'Little Rock', 'Miami', 'New Orleans', 'Newark', 'Phoenix', 'Raleigh', 'San Diego', 'San Francisco', 'Seattle'] 
})
data
# Create a map of the US
#I played with lat and lon to fit Honolulu
latitude = 40
longitude = -120
m = folium.Map(location=[latitude, longitude], zoom_start=3)  #played with zoom to fit better
m 
#  add marker points one by one on the map
for i in range(0,len(data)):
    folium.Marker([data.iloc[i]['lat'], data.iloc[i]['lon']], popup=data.iloc[i]['name']).add_to(m)
m
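#Optionally, save the interactive map as a standalone HTML file (the file name is illustrative)
m.save("pressure_map.html")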