Python Code: Pressure Changes
The code below was used to analyze changes in barometric pressure in 16 cities in the United States. It is a good example of basic Python programming for data cleaning and exploratory data analysis.
The example below covers the following topics: (a) concatenating csv files, (b) dropping columns, (c) dropping rows based on a condition, (d) replacing values in a column, (e) datetime operations, (f) writing basic functions, (g) grouping observations by a value and basic statistical functions, and (h) visualizations such as marking coordinates on a map and ggplot-style graphs.
#Import packages and set the working directory
import os
import glob
import pandas as pd

os.chdir("/Users/gellerttoth/Documents/Datasklr/Weather/Analysis Space")
#Match the csv pattern
#Save the list of all file names in the 'all_filenames' variable
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

#Concatenate all files on the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')

#Concatenate and create a dataframe called all_cities
all_cities = pd.concat([pd.read_csv(f) for f in all_filenames])
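When concatenating many station files it can also be useful to keep track of which file each row came from. A minimal sketch, reusing the all_filenames list above; the source_file column name is my own choice and not part of the original code:

#Optional: tag each row with its source file while concatenating (source_file is a hypothetical column name)
all_cities_tagged = pd.concat(
    [pd.read_csv(f).assign(source_file=f) for f in all_filenames],
    ignore_index=True
)
all_cities_tagged['source_file'].value_counts()   #rows contributed by each station file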
#Check the dataframe
all_cities.head(5)     #look at the first 5 observations
len(all_cities)        #find the number of observations in the data
all_cities.dtypes      #look at the variable types in the data
all_cities.shape       #check how many observations and variables are in the data
type(all_cities)       #check the type of object: see if it is a pandas DataFrame
all_cities.columns     #list the column names

print(all_cities.head(5), len(all_cities), all_cities.dtypes, all_cities.shape, type(all_cities), all_cities.columns)
#Drop unwanted columns/select the necessary columns
#Save the work in a new dataframe called analytical_dataset
analytical_dataset = all_cities[['USAF', 'WBAN', 'YR--MODAHRMN', 'SLP', 'ALT', 'STP']]
analytical_dataset.head()

#Clean up the data
#Change the name of the column YR--MODAHRMN to DATE_HOUR
analytical_dataset.rename(columns={'YR--MODAHRMN':'DATE_HOUR'}, inplace=True)
#Examination of the data shows that most weather stations report in a predetermined sequence but also
#produce out-of-sequence reports. These out-of-sequence reports do not have SLP measurements and are marked with stars:
#six stars for SLP and five stars for ALT
#Clean up the SLP and ALT columns - drop rows by condition
#The procedure keeps the regular on-the-hour measurements and drops the out-of-sequence reports
analytical_dataset = analytical_dataset[analytical_dataset.SLP != '******']
analytical_dataset = analytical_dataset[analytical_dataset.ALT != '*****']
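An alternative to filtering on the literal star strings is to coerce the columns to numeric and drop the rows that fail to parse. A minimal sketch, assuming the only non-numeric entries are the starred placeholders; it works on a copy so the original dataframe is left untouched:

#Optional alternative: coerce to numeric and drop rows that cannot be parsed
cleaned = analytical_dataset.copy()
cleaned['SLP'] = pd.to_numeric(cleaned['SLP'], errors='coerce')
cleaned['ALT'] = pd.to_numeric(cleaned['ALT'], errors='coerce')
cleaned = cleaned.dropna(subset=['SLP', 'ALT'])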
#The list of station codes is included in the dataframe
#Look up the name of the city for each USAF ID on the internet
analytical_dataset.USAF.unique()

#Recode each USAF ID with the appropriate city name to be more meaningful
analytical_dataset['USAF'].replace(725020,'Newark', inplace=True)
analytical_dataset['USAF'].replace(722020,'Miami', inplace=True)
analytical_dataset['USAF'].replace(722780,'Phoenix', inplace=True)
analytical_dataset['USAF'].replace(724940,'San Francisco', inplace=True)
analytical_dataset['USAF'].replace(722900,'San Diego', inplace=True)
analytical_dataset['USAF'].replace(725650,'Denver', inplace=True)
analytical_dataset['USAF'].replace(726070,'Bangor', inplace=True)
analytical_dataset['USAF'].replace(722590,'Dallas', inplace=True)
analytical_dataset['USAF'].replace(911820,'Honolulu', inplace=True)
analytical_dataset['USAF'].replace(725300,'Chicago', inplace=True)
analytical_dataset['USAF'].replace(727930,'Seattle', inplace=True)
analytical_dataset['USAF'].replace(724846,'Las Vegas', inplace=True)
analytical_dataset['USAF'].replace(723060,'Raleigh', inplace=True)
analytical_dataset['USAF'].replace(725510,'Lincoln', inplace=True)
analytical_dataset['USAF'].replace(723405,'Little Rock', inplace=True)
analytical_dataset['USAF'].replace(722316,'New Orleans', inplace=True)
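The same recoding can be expressed more compactly with a single dictionary-based replace. A minimal sketch using the same ID-to-city mapping as above:

#Optional: recode all USAF IDs in one call with a dictionary
usaf_to_city = {
    725020: 'Newark', 722020: 'Miami', 722780: 'Phoenix', 724940: 'San Francisco',
    722900: 'San Diego', 725650: 'Denver', 726070: 'Bangor', 722590: 'Dallas',
    911820: 'Honolulu', 725300: 'Chicago', 727930: 'Seattle', 724846: 'Las Vegas',
    723060: 'Raleigh', 725510: 'Lincoln', 723405: 'Little Rock', 722316: 'New Orleans'
}
analytical_dataset['USAF'] = analytical_dataset['USAF'].replace(usaf_to_city)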
#MAKE THE DATE_HOUR COLUMN USEFUL FOR ANALYSIS
from datetime import datetime

#First change DATE_HOUR to a string
analytical_dataset['DATE_HOUR'] = analytical_dataset.DATE_HOUR.astype(int).astype(str)

#The date_time data was a bit unusual since it contained year, month, day, hour and minute in one string
#The function creates a datetime from the 12-digit string
#The first four digits are the year, then month, day, hour and minute
def date_convert(input_str):
    from datetime import datetime
    output = datetime(year = int(input_str[0:4]),
                      month = int(input_str[4:6]),
                      day = int(input_str[6:8]),
                      hour = int(input_str[8:10]),
                      minute = int(input_str[10:12]))
    return output

#Deploy the function on the data and create the NEWDATE column, which contains the date in datetime format
analytical_dataset['NEWDATE'] = analytical_dataset.DATE_HOUR.astype(int).astype(str).apply(date_convert)
analytical_dataset.head()
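For a vectorized alternative to the row-by-row apply, pandas can parse the 12-digit string directly with a format code. A minimal sketch, assuming DATE_HOUR always follows the YYYYMMDDHHMM layout described above:

#Optional: vectorized parse of the 12-digit YYYYMMDDHHMM string
analytical_dataset['NEWDATE'] = pd.to_datetime(
    analytical_dataset['DATE_HOUR'], format='%Y%m%d%H%M'
)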
#The SLP object needs to be converted to float for the calculation of the standard deviation and mean
analytical_dataset['SLP'] = analytical_dataset.SLP.astype(float)

#Calculate the standard deviation and mean of each city's SLP
#Concatenate the sd and mean columns
#Sort by standard deviation
result1 = analytical_dataset.groupby('USAF').std()
result1.rename(columns={'SLP':'Standard Deviation'}, inplace=True)
result2 = analytical_dataset.groupby('USAF').mean()
result2.rename(columns={'SLP':'Mean'}, inplace=True)
result3 = pd.concat([result1['Standard Deviation'], result2['Mean']], axis=1, sort=False)
result3 = result3.sort_values(by=['Standard Deviation'])
result3

#Export the dataframe to csv so that we can create an html table for the web
#result3.to_csv(r'Path where you want to store the exported CSV file\File Name.csv')
result3.to_csv("/Users/gellerttoth/Documents/Datasklr/Weather/Analysis Space/cities.csv")

#Transform the NEWDATE column to datetime format
analytical_dataset["NEWDATE"] = pd.to_datetime(analytical_dataset["NEWDATE"])
analytical_dataset.info()
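The two groupby passes and the concat can also be collapsed into a single aggregation call. A minimal sketch of the same per-city statistics, assuming SLP has already been converted to float:

#Optional: standard deviation and mean of SLP by city in a single aggregation
result_agg = (analytical_dataset
              .groupby('USAF')['SLP']
              .agg(['std', 'mean'])
              .rename(columns={'std': 'Standard Deviation', 'mean': 'Mean'})
              .sort_values(by='Standard Deviation'))
result_agg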
#Create year and month columns from the datetime column (NEWDATE)
#This preparation will be used later for crosstabs
analytical_dataset['year'] = pd.DatetimeIndex(analytical_dataset['NEWDATE']).year
analytical_dataset['month'] = pd.DatetimeIndex(analytical_dataset['NEWDATE']).month
analytical_dataset.head()

#Categorize months and days as one of the four seasons
#The first step is to create a month/day column by slicing the MMDD portion out of the 12-digit DATE_HOUR string
#DATE_HOUR needs to be converted to a string
analytical_dataset['monthday'] = analytical_dataset['DATE_HOUR'].astype(str).str.slice(4,8)   #use a string for the slice
analytical_dataset['monthday'] = analytical_dataset['monthday'].astype(int)                   #must use an integer for if/elif/else
analytical_dataset.head()
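The year and month can also be pulled straight off the datetime column with the .dt accessor, which avoids building a DatetimeIndex. A minimal sketch, assuming NEWDATE is already a datetime column; the monthday value is rebuilt as the same MMDD integer used above:

#Optional: extract date parts with the .dt accessor
analytical_dataset['year'] = analytical_dataset['NEWDATE'].dt.year
analytical_dataset['month'] = analytical_dataset['NEWDATE'].dt.month
analytical_dataset['monthday'] = (analytical_dataset['NEWDATE'].dt.month * 100
                                  + analytical_dataset['NEWDATE'].dt.day)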
#spring: march 20 - june 21
#summer: june 21 - september 23
#fall: september 23 - december 21
#winter: december 21 - march 19

#Write a function that assigns a season to each date range
def which_season(x):
    if x >= 320 and x <= 621:
        return 'spring'
    elif x >= 622 and x <= 923:
        return 'summer'
    elif x >= 924 and x <= 1221:
        return 'fall'
    else:
        return 'winter'

#Deploy the function
analytical_dataset['seasons'] = analytical_dataset['monthday'].apply(which_season)
analytical_dataset.head()

#Analyze by season
#Calculate the standard deviation and mean by season
result10 = pd.DataFrame(analytical_dataset.groupby('seasons').std())
result10.rename(columns={'SLP':'Standard Deviation'}, inplace=True)
result11 = pd.DataFrame(analytical_dataset.groupby('seasons').mean())
result11.rename(columns={'SLP':'Mean'}, inplace=True)
result12 = pd.concat([result10['Standard Deviation'], result11['Mean']], axis=1, sort=False)
result12

#Analyze by year
#Calculate the standard deviation and mean by year
result100 = pd.DataFrame(analytical_dataset.groupby('year').std())
result100.rename(columns={'SLP':'Standard Deviation'}, inplace=True)
result101 = pd.DataFrame(analytical_dataset.groupby('year').mean())
result101.rename(columns={'SLP':'Mean'}, inplace=True)
result102 = pd.concat([result100['Standard Deviation'], result101['Mean']], axis=1, sort=False)
result102

#Calculate the standard deviation of SLP by city and season
#Grouped first by city then season, then re-grouped by season then city (the second ordering is kept)
grouped1 = analytical_dataset.groupby(['USAF','seasons'])['SLP'].std()
grouped1 = analytical_dataset.groupby(['seasons','USAF'])['SLP'].std()
pd.DataFrame(grouped1)
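The same city-by-season table can also be computed once and unstacked into a wide layout, which is easier to scan than the long MultiIndex. A minimal sketch using the same column names as above:

#Optional: city-by-season standard deviation of SLP as a wide table
season_std = (analytical_dataset
              .groupby(['USAF', 'seasons'])['SLP']
              .std()
              .unstack('seasons'))
season_std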
#Create visualizations
import numpy as np
from plotnine import *

#First create a filtered copy and set the index
data = analytical_dataset
data = (
    data.filter(['USAF', 'SLP', 'year', 'seasons'])
        .set_index('USAF')
)

data = analytical_dataset

fig = (
    ggplot(data) +
    geom_boxplot(aes(x = 'USAF', y = 'SLP'), color='black') +
    labs(
        title = 'Variability in Atmospheric Pressure - Select US Cities',
        x = 'millibars',
        y = 'Variation around the mean'
    ) +
    theme(axis_text_x = element_text(angle = 90, hjust = 1))
)
fig

fig = (
    ggplot(data) +
    geom_boxplot(aes(x = 'seasons', y = 'SLP'), color='black') +
    labs(
        title = 'Variability in Atmospheric Pressure by Season',
        x = 'millibars (aggregate of selected cities)',
        y = 'Variation around the mean'
    )
)
fig
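To reuse the plots on a web page, the plotnine figure objects can be written out to image files. A minimal sketch; the file name is my own placeholder:

#Optional: save the last figure to disk (file name is a placeholder)
fig.save('pressure_by_season.png', width=8, height=6, dpi=300)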
#Import libraries
import folium
import pandas as pd

#Make a data frame with the dots to show on the map
#Note: 'lat' holds latitudes (N-S) and 'lon' holds longitudes (E-W); western-hemisphere longitudes are negative
data = pd.DataFrame({
    'lat':[44.8016, 41.8781, 32.7767, 39.7392, 21.3069, 36.1699, 40.8136, 34.7465,
           25.7617, 29.9511, 40.7357, 33.4484, 35.7796, 32.7157, 37.7749, 47.6062],
    'lon':[-68.7712, -87.6298, -96.7970, -104.9903, -157.8583, -115.1398, -96.7026, -92.2896,
           -80.1918, -90.0715, -74.1724, -112.0740, -78.6382, -117.1611, -122.4194, -122.3321],
    'name':['Bangor', 'Chicago', 'Dallas', 'Denver', 'Honolulu', 'Las Vegas', 'Lincoln', 'Little Rock',
            'Miami', 'New Orleans', 'Newark', 'Phoenix', 'Raleigh', 'San Diego', 'San Francisco', 'Seattle']
})
data

#Create a map of the US
#I played with the latitude and longitude of the map center to fit Honolulu
latitude = 40
longitude = -120
m = folium.Map(location=[latitude, longitude], zoom_start=3)   #played with the zoom to fit better
m

#Add the marker points one by one to the map
for i in range(0, len(data)):
    folium.Marker([data.iloc[i]['lat'], data.iloc[i]['lon']],
                  popup=data.iloc[i]['name']).add_to(m)
m
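Once the markers are added, the folium map can be written out as a standalone HTML file for embedding in a page. A minimal sketch; the output file name is my own placeholder:

#Optional: save the interactive map to an HTML file (file name is a placeholder)
m.save('pressure_station_map.html')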