Statistical analysis for Climate datasets

Authors:

Md Tashin Ahammad (tashinahammad.03@gmail.com)

Abdullah Al Fahad (a.fahad@nasa.gov)

# from google.colab import drive
# drive.mount('/content/drive')

!pip uninstall -y shapely
!pip install cartopy==0.21
!pip install xskillscore

import cartopy.crs as ccrs
import cartopy.feature as cf
import matplotlib.pyplot as plt
import numpy as np
import requests
import xarray as xr
import xskillscore as xs
from scipy import stats

xskillscore for statistical correlation https://xskillscore.readthedocs.io/en/stable/api/xskillscore.pearson_r.html#

Correlation

Correlation is a statistical measure that indicates the strength and direction of a relationship between two variables. It ranges from -1 to 1, where -1 represents a perfect negative correlation, 0 indicates no correlation, and 1 indicates a perfect positive correlation.

# Two short example series used to illustrate the Pearson correlation.
x = [1, 2, 3, 5, 8, 13, 21]
y = [1, 4, 9, 16, 25, 36, 49]

# np.corrcoef returns the 2x2 correlation matrix; element [0, 1] is r(x, y).
corr = np.corrcoef(x, y)[0, 1]

# Two-sided p-value for the null hypothesis of zero correlation.
# The earlier expression 2 * (1 - |r|) was not a p-value: it ignored the
# sample size, which a significance test depends on.
pvalue = stats.pearsonr(x, y)[1]

print(corr)
print(pvalue)

0.9846915385844243
0.040000000000000036

To find the correlation coefficient between 'x' and 'y', we use the np.corrcoef() function. It returns a matrix of Pearson correlation coefficients; because we pass two arrays, we get a 2x2 matrix in which the correlation coefficient between 'x' and 'x' is in the top-left corner (array position [0, 0]) and the correlation coefficient between 'x' and 'y' is in the top-right corner (array position [0, 1]), which is approximately 0.98.

Now, the p-value measures the statistical significance of the correlation coefficient. In general, a p-value less than 0.05 is considered significant: if the two variables were actually uncorrelated, a correlation at least this strong would be expected less than 5% of the time.

# Scatter plot of the example series, with both statistics shown in the title.

# Round each statistic to two decimals before placing it in the title.
corr_label = str(np.around(corr.min(), 2))
pvalue_label = str(np.around(pvalue, 2))

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.set(
    xlabel='x',
    ylabel='y',
    title='Correlation: ' + corr_label + '  Pvalue :' + pvalue_label,
)
plt.show()
# TODO: draw trendline

Data : https://drive.google.com/file/d/10Cf3CbMwX6Z2hnIZg8wWDVajhm6rZ2OU/view?usp=drivesdk

We specify the URL in the file_url variable.

Next, we make a request to the URL using requests.get(). The response object contains the content of the file.

We then open a file in binary write mode ("wb") at the desired location ("/content/data.nc") and write the content of the response into it.

Finally, we use xr.open_dataset() from xarray to open the downloaded file and load it into the data variable. You can replace "/content/data.nc" with your desired file path.

After executing this code, the data is loaded into the data variable, and we can proceed with further analysis or operations using xarray.

# Direct-download form of the Google Drive link, and where to save the file.
file_url = "https://drive.google.com/uc?id=10Cf3CbMwX6Z2hnIZg8wWDVajhm6rZ2OU&export=download"
file_path = "/content/data.nc"

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of saving the error
# body to disk and failing later inside xr.open_dataset().
response = requests.get(file_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Open the saved NetCDF file and load its contents into memory.
data = xr.open_dataset(file_path).load()
data

<xarray.Dataset>
Dimensions:    (longitude: 29, latitude: 33, time: 8760)
Coordinates:
  * longitude  (longitude) float32 87.0 87.25 87.5 87.75 ... 93.5 93.75 94.0
  * latitude   (latitude) float32 28.0 27.75 27.5 27.25 ... 20.5 20.25 20.0
  * time       (time) datetime64[ns] 2022-01-01 ... 2022-12-31T23:00:00
Data variables:
    t2m        (time, latitude, longitude) float32 253.7 253.5 ... 286.0 284.4
    tp         (time, latitude, longitude) float32 1.02e-05 2.856e-05 ... 0.0
Attributes:
    Conventions:  CF-1.6
    history:      2023-07-04 14:04:14 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...

array([87.  , 87.25, 87.5 , 87.75, 88.  , 88.25, 88.5 , 88.75, 89.  , 89.25,
       89.5 , 89.75, 90.  , 90.25, 90.5 , 90.75, 91.  , 91.25, 91.5 , 91.75,
       92.  , 92.25, 92.5 , 92.75, 93.  , 93.25, 93.5 , 93.75, 94.  ],
      dtype=float32)

array([28.  , 27.75, 27.5 , 27.25, 27.  , 26.75, 26.5 , 26.25, 26.  , 25.75,
       25.5 , 25.25, 25.  , 24.75, 24.5 , 24.25, 24.  , 23.75, 23.5 , 23.25,
       23.  , 22.75, 22.5 , 22.25, 22.  , 21.75, 21.5 , 21.25, 21.  , 20.75,
       20.5 , 20.25, 20.  ], dtype=float32)

array(['2022-01-01T00:00:00.000000000', '2022-01-01T01:00:00.000000000',
       '2022-01-01T02:00:00.000000000', ..., '2022-12-31T21:00:00.000000000',
       '2022-12-31T22:00:00.000000000', '2022-12-31T23:00:00.000000000'],
      dtype='datetime64[ns]')

array([[[253.68433, 253.47284, 250.54309, ..., 273.803  , 275.77994,
         280.56708],
        [255.01645, 257.21866, 256.58652, ..., 275.73624, 276.98102,
         279.7372 ],
        [265.98264, 265.0827 , 266.5263 , ..., 277.01205, 279.02805,
         281.2406 ],
        ...,
        [293.6056 , 295.0883 , 295.6055 , ..., 285.25653, 284.8485 ,
         285.80478],
        [295.13885, 295.8963 , 296.29514, ..., 287.36792, 286.9599 ,
         286.32428],
        [296.0526 , 296.57672, 296.78937, ..., 291.96887, 288.1426 ,
         287.07254]],

       [[253.81306, 253.42802, 249.54314, ..., 273.65472, 276.00174,
         279.15103],
        [255.16588, 257.00256, 255.76355, ..., 275.5742 , 276.9086 ,
         279.1522 ],
        [266.25848, 264.92178, 266.0378 , ..., 277.04538, 279.11887,
         281.33255],
...
        [296.0756 , 297.1204 , 297.0606 , ..., 283.3727 , 282.29688,
         283.0635 ],
        [297.25714, 297.21002, 297.1514 , ..., 285.75766, 284.6807 ,
         283.08307],
        [297.39392, 297.16406, 297.2514 , ..., 290.42987, 286.3473 ,
         284.49908]],

       [[254.96588, 257.13934, 260.73346, ..., 275.2627 , 276.11325,
         279.41998],
        [260.153  , 260.95184, 264.0023 , ..., 274.37885, 275.52017,
         277.99246],
        [271.16632, 271.5686 , 272.16284, ..., 274.1662 , 276.73737,
         279.2315 ],
        ...,
        [296.16638, 297.11923, 297.0468 , ..., 283.31638, 282.31644,
         283.33246],
        [297.24796, 297.19968, 297.12958, ..., 285.6818 , 284.68182,
         283.49683],
        [297.37555, 297.14453, 297.23303, ..., 290.30115, 285.97717,
         284.4083 ]]], dtype=float32)

array([[[1.0199845e-05, 2.8558075e-05, 8.2615763e-05, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [2.0377338e-06, 2.6518852e-04, 1.2647361e-04, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [3.6716461e-05, 7.1395189e-05, 6.1169267e-06, ...,
         3.0584633e-06, 1.0170043e-06, 0.0000000e+00],
        ...,
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00]],

       [[6.1169267e-06, 1.5296042e-05, 4.5895576e-05, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [1.0170043e-06, 1.7441064e-04, 7.7515841e-05, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [1.5296042e-05, 4.6916306e-05, 1.0170043e-06, ...,
         1.0170043e-06, 0.0000000e+00, 0.0000000e+00],
...
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00]],

       [[1.9378960e-05, 3.0595809e-05, 3.5695732e-05, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [5.0999224e-06, 3.5494566e-04, 1.5299395e-04, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [1.6421080e-04, 2.3050979e-04, 1.2545288e-04, ...,
         6.2216073e-05, 2.5495887e-05, 0.0000000e+00],
        ...,
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00]]], dtype=float32)

PandasIndex(Float64Index([ 87.0, 87.25,  87.5, 87.75,  88.0, 88.25,  88.5, 88.75,  89.0,
              89.25,  89.5, 89.75,  90.0, 90.25,  90.5, 90.75,  91.0, 91.25,
               91.5, 91.75,  92.0, 92.25,  92.5, 92.75,  93.0, 93.25,  93.5,
              93.75,  94.0],
             dtype='float64', name='longitude'))

PandasIndex(Float64Index([ 28.0, 27.75,  27.5, 27.25,  27.0, 26.75,  26.5, 26.25,  26.0,
              25.75,  25.5, 25.25,  25.0, 24.75,  24.5, 24.25,  24.0, 23.75,
               23.5, 23.25,  23.0, 22.75,  22.5, 22.25,  22.0, 21.75,  21.5,
              21.25,  21.0, 20.75,  20.5, 20.25,  20.0],
             dtype='float64', name='latitude'))

PandasIndex(DatetimeIndex(['2022-01-01 00:00:00', '2022-01-01 01:00:00',
               '2022-01-01 02:00:00', '2022-01-01 03:00:00',
               '2022-01-01 04:00:00', '2022-01-01 05:00:00',
               '2022-01-01 06:00:00', '2022-01-01 07:00:00',
               '2022-01-01 08:00:00', '2022-01-01 09:00:00',
               ...
               '2022-12-31 14:00:00', '2022-12-31 15:00:00',
               '2022-12-31 16:00:00', '2022-12-31 17:00:00',
               '2022-12-31 18:00:00', '2022-12-31 19:00:00',
               '2022-12-31 20:00:00', '2022-12-31 21:00:00',
               '2022-12-31 22:00:00', '2022-12-31 23:00:00'],
              dtype='datetime64[ns]', name='time', length=8760, freq=None))

# Coordinate axes used later when drawing the maps.
lon, lat = data['longitude'], data['latitude']

# Monthly means of 2 m temperature and of total precipitation.
# NOTE(review): the x1000 factor presumably converts metres to millimetres --
# confirm against the dataset's units.
temperature = data['t2m'].resample(time="1M").mean()
precipitation = (data['tp'] * 1000).resample(time="1M").mean()

The two types of correlation mentioned in the code are:

Pearson Correlation: It measures the linear relationship between two continuous variables. It is sensitive to outliers and assumes that the variables are normally distributed.
Spearman Correlation: It assesses the monotonic relationship between two variables, which means it captures both linear and non-linear relationships. It is less sensitive to outliers and does not require the variables to be normally distributed. Instead, it relies on rank ordering the data.

# Both statistics compare the same pair of fields, reduced over 'time'.
series = (precipitation, temperature)

# Pearson (linear) correlation and its p-value.
corre1 = xs.pearson_r(*series, dim='time')
p_value1 = xs.pearson_r_p_value(*series, dim='time')

# Spearman (rank) correlation and its p-value.
corre2 = xs.spearman_r(*series, dim='time')
p_value2 = xs.spearman_r_p_value(*series, dim='time')

# Panel data and panel titles, index-aligned: position 0 is Pearson,
# position 1 is Spearman.
# NOTE: this rebinds `corr`, replacing any value it held earlier.
corr = [result.data for result in (corre1, corre2)]
corr_title = ["a) Pearson Correlation", "b) Spearman Correlation"]

p_value = [p_value1, p_value2]
p_value_title = ["a) P-value (Pearson)", "b) P-value (Spearman)"]

# Draw the Pearson and Spearman correlation fields side by side on maps.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 8),
                        subplot_kw={'projection': ccrs.PlateCarree()})

# Contour boundaries shared by both panels so their colours are comparable.
# Only `levels=` is passed to contourf: the extra positional level count (30)
# that used to accompany it was ignored in favour of the explicit levels.
corr_levels = np.arange(-.9, 0.91, .2)

for ax, field, title in zip(axs.flat, corr, corr_title):
    cs = ax.contourf(lon, lat, field, levels=corr_levels, cmap='coolwarm',
                     extend='both', transform=ccrs.PlateCarree())

    # Geographic context layers.
    ax.add_feature(cf.COASTLINE)
    ax.add_feature(cf.BORDERS, linewidth=1)
    ax.add_feature(cf.OCEAN)
    ax.add_feature(cf.LAKES)
    ax.add_feature(cf.LAND)
    ax.add_feature(cf.RIVERS)
    ax.set_extent([87, 94, 20, 28])
    ax.gridlines(draw_labels=True, linewidth=0.2)
    ax.set_title(title, fontsize=12, pad=10, y=1.04, loc="left")

    # Add vertical colorbar to each subplot with a custom size
    cbar = plt.colorbar(cs, ax=ax, orientation='vertical', pad=0.1, shrink=0.7)
    cbar.ax.tick_params(labelsize=10)

# Add a title to the figure
fig.suptitle('Correlation Coefficient between Precipitation and temperature', fontsize=20, fontweight='bold')

# Adjust the spacing between subplots and display the plot
plt.tight_layout()

plt.show()

# Draw the Pearson and Spearman p-value fields side by side on maps.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 8),
                        subplot_kw={'projection': ccrs.PlateCarree()})

# Three bands: p < 0.01, 0.01 <= p < 0.05, and p >= 0.05.
# Only `levels=` is passed to contourf: the extra positional level count (30)
# that used to accompany it was ignored in favour of the explicit levels.
p_levels = np.array([0, .01, 0.05, 1])

for ax, field, title in zip(axs.flat, p_value, p_value_title):
    cs = ax.contourf(lon, lat, field, levels=p_levels, cmap='Pastel1',
                     transform=ccrs.PlateCarree())

    # Geographic context layers.
    ax.add_feature(cf.COASTLINE)
    ax.add_feature(cf.BORDERS, linewidth=1)
    ax.add_feature(cf.OCEAN)
    ax.add_feature(cf.LAKES)
    ax.add_feature(cf.LAND)
    ax.add_feature(cf.RIVERS)
    ax.set_extent([87, 94, 20, 28])
    ax.gridlines(draw_labels=True, linewidth=0.2)
    ax.set_title(title, fontsize=12, pad=10, y=1.04, loc="left")

    # Add vertical colorbar to each subplot with a custom size
    cbar = plt.colorbar(cs, ax=ax, orientation='vertical', pad=0.1, shrink=0.7)
    cbar.ax.tick_params(labelsize=10)

# Add a title to the figure
fig.suptitle('P-value associated with Correlation Coefficient between Precipitation and temperature', fontsize=20, fontweight='bold')

# Adjust the spacing between subplots and display the plot
plt.tight_layout()

plt.show()