
In recent years, data science has come to play a major role in crime analysis. In this data science project we will perform some exploratory data analysis (EDA) on crime data for the city of San Francisco, California.
Let’s start with importing the required libraries
# Basic numerical and tabular operations.
import numpy as np
import pandas as pd

# Visualizations.
import matplotlib.pyplot as plt
import seaborn as sns
import folium

# squarify (tree maps) is a third-party package. In a notebook, install it
# first with the shell escape below; it is kept as a comment because it is
# not valid Python outside IPython/Jupyter:
#   !pip install squarify
import squarify
Load the data set (this assumes crime.csv has already been downloaded into the working directory)
# Read the crime records from the local CSV file into a DataFrame.
data = pd.read_csv('crime.csv')

# Check the shape of the data as (rows, columns).
data.shape
#Output
(150500, 13)
To see the first 5 rows in the data set
# Preview the first rows of the table (head() returns 5 rows by default).
data.head()

To Describe the data set
# Summary statistics of the data set (by default, for its numeric columns).
data.describe()

To check if there are any null values
# Count the missing (null) values in each column.
data.isnull().sum()
#Output IncidntNum 0 Category 0 Descript 0 DayOfWeek 0 Date 0 Time 0 PdDistrict 1 Resolution 0 Address 0 X 0 Y 0 Location 0 PdId 0 dtype: int64
Filling the missing value in PdDistrict using the mode values
# Fill the missing police-district entry with the most frequent district.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on data['PdDistrict']: the chained in-place form acts
# on an intermediate object and is not guaranteed to update `data` itself
# under pandas' copy-on-write behaviour.
most_common_district = data['PdDistrict'].mode()[0]
data['PdDistrict'] = data['PdDistrict'].fillna(most_common_district)

# Confirm that no null values remain anywhere in the table (expects False).
data.isnull().any().any()
Data Visualization
Different categories of crime
# Bar chart of the number of incidents in each crime category.
plt.rcParams['figure.figsize'] = (20, 9)
plt.style.use('dark_background')

# Pass the column through the explicit `x=` keyword. Newer seaborn releases
# take `data` as the first positional parameter, so a positional Series is
# no longer interpreted as the x variable.
sns.countplot(x=data['Category'], palette='gnuplot')

plt.title('Major Crimes in Sanfrancisco', fontweight=30, fontsize=20)
plt.xticks(rotation=90)  # category names are long; rotate them to fit
plt.show()

Plotting a tree map
# Tree map of the 25 most frequent crime categories; each rectangle's area
# is proportional to the category's incident count.
y = data['Category'].value_counts().head(25)

plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fivethirtyeight')

# One colour per rectangle. The palette is sized from the data so that
# colours are not reused when there are more categories than colours.
color = plt.cm.magma(np.linspace(0, 1, len(y)))

squarify.plot(sizes=y.values, label=y.index, alpha=.8, color=color)
plt.title('Tree Map for Top 25 Crimes', fontsize=20)
plt.axis('off')
plt.show()

Description of the Crime
from wordcloud import WordCloud

# Word cloud of the free-text crime descriptions.
plt.rcParams['figure.figsize'] = (15, 15)
plt.style.use('fast')

# Join every description into one text. str(data['Descript']) would only
# yield the Series' printed summary (a few head/tail rows plus the
# "Name / Length / dtype" footer), so the cloud would not reflect the data.
descriptions = ' '.join(data['Descript'].astype(str))

wc = WordCloud(
    background_color='orange',
    width=1500,
    height=1500,
).generate(descriptions)

plt.title('Description of the Crime', fontsize=20)
plt.imshow(wc)
plt.axis('off')
plt.show()

Regions with count of crimes
# Bar chart of the number of incidents recorded in each police district.
plt.rcParams['figure.figsize'] = (20, 9)

# matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8' and
# later removed the old name; pick whichever name this installation has.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)

color = plt.cm.spring(np.linspace(0, 1, 15))
data['PdDistrict'].value_counts().plot.bar(color=color, figsize=(15, 10))

plt.title('District with Most Crime', fontsize=30)
plt.xticks(rotation=90)
plt.show()

Top 15 Addresses in San Francisco in Crime
# Bar chart of the 15 addresses with the most recorded incidents.
plt.rcParams['figure.figsize'] = (20, 9)

# matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8' and
# later removed the old name; pick whichever name this installation has.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)

color = plt.cm.ocean(np.linspace(0, 1, 15))
data['Address'].value_counts().head(15).plot.bar(color=color, figsize=(15, 10))

plt.title('Top 15 Regions in Crime', fontsize=20)
plt.xticks(rotation=90)
plt.show()

Crime count by day of the week
# Pie chart of the share of incidents falling on each day of the week.

# matplotlib 3.6 renamed its bundled seaborn style to 'seaborn-v0_8' and
# later removed the old name; pick whichever name this installation has.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)

day_counts = data['DayOfWeek'].value_counts().head(15)

# Offset every wedge by the same amount. The explode list is sized from the
# data, because pie() requires exactly one entry per wedge.
day_counts.plot.pie(figsize=(15, 8), explode=[0.1] * len(day_counts))

plt.title('Crime count on each day', fontsize=20)
plt.xticks(rotation=90)
plt.show()

Crimes in Each Month
# Parse the date column and derive the calendar month (1-12) of each incident.
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (15, 8)

# Pass the column through the explicit `x=` keyword. Newer seaborn releases
# take `data` as the first positional parameter, so a positional Series is
# no longer interpreted as the x variable.
sns.countplot(x=data['Month'], palette='autumn')

plt.title('Crimes in each Months', fontsize=20)
plt.show()

Checking the times at which crimes occur most often
import warnings

# Silence all warnings from this point on.
warnings.filterwarnings('ignore')

# Bar chart of the 20 most frequently recorded times of day.
# Only the first 20 of the 100 sampled colours are consumed by the 20 bars.
color = plt.cm.twilight(np.linspace(0, 5, 100))
busiest_times = data['Time'].value_counts().head(20)
busiest_times.plot.bar(color=color, figsize=(15, 9))

plt.title('Distribution of crime over the day', fontsize=20)
plt.show()

District vs Category of Crime
# Stacked bar chart: for each crime category, the fraction of its incidents
# that fall in each police district.
df = pd.crosstab(data['Category'], data['PdDistrict'])
color = plt.cm.Greys(np.linspace(0, 1, 10))

# Divide each row by its own total so that every bar sums to 1.
category_totals = df.sum(axis=1).astype(float)
district_shares = df.div(category_totals, axis=0)
district_shares.plot.bar(stacked=True, color=color, figsize=(18, 12))

plt.title('District vs Category of Crime', fontweight=30, fontsize=20)
plt.xticks(rotation=90)
plt.show()

Geographical Visualization
# Build a two-column table (Neighborhood, Count) with the number of
# incidents per police district, in a fixed district order.
district_order = [
    "CENTRAL", "NORTHERN", "PARK", "SOUTHERN", "MISSION",
    "TENDERLOIN", "RICHMOND", "TARAVAL", "INGLESIDE", "BAYVIEW",
]

t = data['PdDistrict'].value_counts()

# Name the index and the counts explicitly rather than renaming a column
# called 'index' after reset_index(). Newer pandas names the value_counts()
# index after the source column, so that rename would silently match nothing
# and leave the table without a 'Neighborhood' column.
table = (
    t.reindex(district_order)
    .rename_axis('Neighborhood')
    .reset_index(name='Count')
)
table
#Output Neighborhood Count 0 CENTRAL 17666 1 NORTHERN 20100 2 PARK 8699 3 SOUTHERN 28446 4 MISSION 19503 5 TENDERLOIN 9942 6 RICHMOND 8922 7 TARAVAL 11325 8 INGLESIDE 11594 9 BAYVIEW 14303
# URL of the GeoJSON file used for the choropleth overlay.
gjson = r'https://cocl.us/sanfran_geojson'

# Base map centred on the given latitude/longitude.
sf_center = [37.77, -122.42]
sf_map = folium.Map(location=sf_center, zoom_start=12)
Density of crime in San Francisco
# Generate the map: shade each district by its incident count.
# Map.choropleth() was deprecated and later removed from folium; the
# supported form is to build a folium.Choropleth layer and add it to the map.
folium.Choropleth(
    geo_data=gjson,
    data=table,
    columns=['Neighborhood', 'Count'],  # key column, then value column
    key_on='feature.properties.DISTRICT',  # GeoJSON property matched to the key
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Crime Rate in San Francisco',
).add_to(sf_map)

sf_map
