!pip install pyjanitor pyvis --quiet
import pandas as pd
import janitor
import datetime
from IPython.core.display import display, HTML
from pyvis import network as net
import networkx as nx
Go to LinkedIn and export your connections as `Connections.csv`.
# Load the connections export; skiprows=2 skips the first two lines of the file.
df_connection = pd.read_csv("Connections.csv", skiprows=2)
# Print a summary of the loaded frame.
df_connection.info()
Cleaning the data with pyjanitor
# Clean the raw export with pyjanitor's chained DataFrame methods; each step
# receives the frame returned by the previous one.
df = (
df_connection
.clean_names() # normalize column names (spacing and capitalization removed)
.drop(columns=['first_name', 'last_name', 'email_address']) # drop for privacy
.dropna(subset=['company', 'position']) # drop rows missing a company or a position
.to_datetime('connected_on', format='%d %b %Y') # parse dates written as day, abbreviated month, year
)
df.head()
The pyjanitor cleaning syntax chains operations one after the other.
For example, df.clean_names().drop(...) means you’re cleaning the column names, then dropping the columns, and so on, where the output of each operation becomes the input of the next.
### Depending on your dataset, you can remove freelance and self-employed titles
# Case-insensitive regex of company labels to exclude; any row whose company
# matches one of the alternatives is filtered out of df.
pattern = "retired|self-employed"
df = df.loc[~df['company'].str.contains(pattern, case=False)]
# Number of connections per company as a two-column frame
# ('company', 'count'), ordered from most to least frequent.
df_company = (
    df['company']
    .value_counts()
    .rename_axis('company')
    .reset_index(name='count')
    .sort_values(by="count", ascending=False)
)
df_company.head(10)
# Number of connections per position as a two-column frame
# ('position', 'count'), ordered from most to least frequent.
df_position = (
    df['position']
    .value_counts()
    .rename_axis('position')
    .reset_index(name='count')
    .sort_values(by="count", ascending=False)
)
df_position.head(10)
import pandas as pd
# Load the contacts export and preview its first rows.
df_contacts = pd.read_csv('Contacts.csv')
print(df_contacts.head())
# Replace missing values with empty strings (done after the preview above).
df_contacts = df_contacts.fillna('')
import plotly.express as px
# Scatter plot with the Title column on x and the CreatedAt column on y.
fig = px.scatter(df_contacts, x="Title", y="CreatedAt")
fig.show()
!pip install --upgrade plotly
I want to delete all entries in the pandas DataFrame that have no value for Title. I use numpy to help: first replace every empty title with NaN, then, as the last step, drop all rows whose Title is NaN.
Display a Treemap
import numpy as np

# Treat blank titles as missing, then drop those rows.
# The cleaned column is assigned back to the frame rather than calling
# replace(..., inplace=True) on the df_contacts['Title'] selection: that
# chained in-place form acts on an intermediate object and does not update
# df_contacts when pandas Copy-on-Write is in effect.
df_contacts['Title'] = df_contacts['Title'].replace('', np.nan)
df_contacts.dropna(subset=['Title'], inplace=True)

# Treemap grouped by Title first and by Companies within each title.
fig = px.treemap(df_contacts, path=['Title', 'Companies'], width=800, height=800)
fig.show()
# Report the node and edge counts of the graph currently bound to `g`.
# NOTE(review): in the visible file `g` is first assigned further down
# (`g = nx.Graph()`), so running top to bottom raises NameError here; run
# these lines after one of the graph-building sections.
print(f"number of nodes: {g.number_of_nodes()}")
print(f"number of edges: {g.number_of_edges()}")
# Print the first five companies as "<company>-<count>" lines.
top_companies = df_company.head(5)
for company_name, n_connections in zip(top_companies['company'], top_companies['count']):
    print(company_name + "-" + str(n_connections))
# Keep only companies and positions that occur at least five times, printing
# each frame's shape before and after the filter.
print(df_company.shape)
df_company_reduced = df_company[df_company['count'].ge(5)]
print(df_company_reduced.shape)

print(df_position.shape)
df_position_reduced = df_position[df_position['count'].ge(5)]
print(df_position_reduced.shape)
# Build a star-shaped graph: a central 'root' node linked to every company
# kept in df_company_reduced.
g = nx.Graph()
g.add_node('root')  # initialize yourself as the central node

# use iterrows to iterate through the data frame
for _, row in df_company_reduced.iterrows():
    # store company name and count
    company = row['company']
    count = row['count']

    # Hover text (HTML): bold company name with its count, followed by a
    # bullet list of the distinct positions recorded for that company.
    # The positions are sorted so the list order is stable between runs;
    # iterating a plain set of strings gives no such guarantee.
    title = f"<b>{company}</b> – {count}"
    positions = sorted(set(df.loc[df['company'] == company, 'position']))
    positions = ''.join(f'<li>{x}</li>' for x in positions)
    position_list = f"<ul>{positions}</ul>"
    hover_info = title + position_list

    # Node size is twice the number of connections at the company.
    g.add_node(company, size=count*2, title=hover_info, color='#3449eb')
    g.add_edge('root', company, color='grey')

# generate the graph
# NOTE(review): notebook=True is passed because recent pyvis releases fail in
# show() when the Network was not created in notebook mode; confirm against
# the installed pyvis version.
nt = net.Network(height='700px', width='700px', bgcolor="black",
                 font_color='white', notebook=True)
nt.from_nx(g)
nt.hrepulsion()
# more customization https://tinyurl.com/yf5lvvdm
nt.show('company_graph.html')
display(HTML('company_graph.html'))
# Build a star-shaped graph: a central 'root' node linked to every position
# kept in df_position_reduced.
g = nx.Graph()
g.add_node('root')  # initialize yourself as the central node

# use iterrows to iterate through the data frame
for _, row in df_position_reduced.iterrows():
    position = row['position']
    # Keep the count numeric for the node size (the company graph passes a
    # number as well); only the hover title needs the text form.
    count = int(row['count'])
    g.add_node(position, size=count, color='#3449eb', title=str(count))
    g.add_edge('root', position, color='grey')

# generate the graph
# NOTE(review): notebook=True is passed because recent pyvis releases fail in
# show() when the Network was not created in notebook mode; confirm against
# the installed pyvis version.
nt = net.Network(height='700px', width='700px', bgcolor="black",
                 font_color='white', notebook=True)
nt.from_nx(g)
nt.hrepulsion()
# more customization https://tinyurl.com/yf5lvvdm
nt.show('position_graph.html')
display(HTML('position_graph.html'))