This post was last modified around 4 years ago, so some information may be outdated!
In this note, I use df for a DataFrame and s for a Series.
Libraries
import pandas as pd # import pandas package
import numpy as np # import numpy package
import matplotlib.pyplot as plt # needed for the plots below
Import and have a look
df = pd.read_csv('filename.csv', na_values=['none']) # treat "none" as missing data (NaN)
df.head() # read first 5 rows
df.tail() # last 5 rows
df.head(10) # first 10 rows
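If there's no CSV file at hand, a small toy DataFrame (hypothetical names and values, reused in some examples below) works just as well,
df = pd.DataFrame({
    'col': ['a', 'b', 'a', None],    # a categorical column with a missing value
    'value': [1.0, 2.5, 2.5, 4.0],   # a numerical column
})
To see what na_values does, read an inline CSV via io.StringIO (made-up data); the 'none' cell becomes NaN,
import io
pd.read_csv(io.StringIO('a,b\n1,none\n2,3'), na_values=['none'])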
Get general infos
df.info() # dtypes, non-null counts, memory usage
df.describe() # numerical features
df.describe(include=['O']) # categorical features
df.describe(include='all') # all types
df.shape # dataframe's shape
df.dtypes # type of each column
df.dtypes.value_counts() # count the number of data types (df.get_dtype_counts() was removed in pandas 1.0)
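Relatedly, to grab only the columns of a given type, pandas' select_dtypes helps,
df.select_dtypes(include='number') # only numerical columns
df.select_dtypes(include='object') # only object (string) columns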
Check distribution of values using KDE (Kernel Density Estimation),
plt.figure(figsize=(20, 5))
df['value'].plot.kde() # needs scipy installed (pandas uses scipy.stats.gaussian_kde)
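A KDE is easier to judge next to a histogram of the same column, e.g. (a sketch on the toy 'value' column),
ax = df['value'].plot.hist(density=True, alpha=0.5, figsize=(20, 5))
df['value'].plot.kde(ax=ax) # overlay the density estimate on the histogram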
Get columns' info
# LIST OF COLUMNS
df.columns
len(df.columns) # number of columns
# UNIQUE VALUES IN COL
df['col'].unique()
df['col'].unique().size # number of unique values (counts NaN)
df['col'].nunique() # number of unique values (ignores NaN by default)
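On the toy frame above, the missing value makes the two counts differ,
df['col'].unique()      # array(['a', 'b', None], dtype=object)
df['col'].unique().size # 3
df['col'].nunique()     # 2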
Counting
# Count the number of elements in each class of df
df.Classes.value_counts() # counts for each class, e.g. 0 and 1
# Count the elements of each unique value in a column/Series
df['col'].value_counts()
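Two useful options: normalize=True returns proportions instead of counts, and dropna=False also counts NaN,
df['col'].value_counts(normalize=True)
df['col'].value_counts(dropna=False)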
Missing values
👉 Handle missing values.
# total number of nans in df
df.isnull().sum().sum()
# #nans in each column (columns with 0 nans are listed too)
df.isnull().sum()
# #not-nans in each column
df.count()
# #not-nans in each row
df.count(axis=1)
# columns having nulls (any nan)
null_columns = df.columns[df.isna().any()].tolist()
# how many nans in each of them?
df[null_columns].isnull().sum()
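Similarly, to pull out the rows that contain at least one nan,
df[df.isna().any(axis=1)] # rows with any nan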
# number of rows having ALL nans
df.isna().all(axis=1).sum()
# number of columns having ALL nans
df.isna().all(axis=0).sum()
# find index of rows having ALL nans
df.index[df.isna().all(axis=1)].to_list()
# number of nans in each column, sorted descending
df.isnull().sum().sort_values(ascending=False)
# find % of null values
(df.isnull().sum()/len(df)*100).sort_values(ascending=False)
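A sketch of acting on these percentages, dropping columns that are more than half nan (the 0.5 threshold and the name df_clean are made up),
df_clean = df.drop(columns=df.columns[df.isna().mean() > 0.5])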
# Visualize the locations of missing values
import seaborn as sns
df = df.set_index('YEAR') # y-axis is YEAR
sns.heatmap(df.isnull(), cbar=False) # x-axis is columns' names
# Plot the percentage of nans w.r.t. each column (feature)
df_tmp = (df.isnull().sum()/len(df)*100).sort_values(ascending=False).to_frame(name='percentage')
df_tmp.reset_index().plot(kind='bar', x='index', y='percentage', figsize=(20,5))
plt.xlabel('features', fontsize=14)
plt.ylabel('% of nans', fontsize=14)
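Roughly the same plot in one line with pandas' own bar plot,
(df.isna().mean()*100).sort_values(ascending=False).plot.bar(figsize=(20, 5))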
Duplicates
👉 Handle duplicates.
# Check if there are duplicated values?
df['col'].duplicated().any() # returns True/False
# How many duplicates? (not counting the first occurrences)
df['col'].duplicated().sum()
# How many duplicates? (including the first occurrences)
df['col'].duplicated(keep=False).sum()
# List all duplicated values (SLOW on large frames!)
pd.concat(g for _, g in df.groupby('col') if len(g) > 1)
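A usually much faster equivalent is a boolean mask with duplicated(keep=False),
df[df['col'].duplicated(keep=False)].sort_values('col') # all rows whose 'col' value appears more than once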