Algorithms [Degree Centrality / Eigenvector Centrality / Katz Centrality / PageRank / HITS Hubs and Authorities / Closeness Centrality / Betweenness Centrality] : https://aksakalli.github.io/2017/07/17/network-centrality-measures-and-their-visualization.html

1. Set up connection with Neo4j¶

from py2neo import Graph

# Open a Bolt connection to the Neo4j instance running on localhost.
# NOTE(review): the credentials are hard-coded here — consider reading them from the environment.
graph = Graph("bolt://localhost:7687", user="neo4j", password="france")
# Start from an empty graph. This is destructive: it removes everything already
# stored in the connected database, so only run it against a scratch instance.
graph.delete_all()

2. Load CSV file into Neo4j database¶

A. 1st example with Riders¶

import pandas as pd 

data_rider = pd.read_csv('https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0001-teams-and-riders.csv')
data_rider.head(3)

list(data_rider)

['RACE_ID',
 'RACE_NAME',
 'RACE_YEAR',
 'RACE_FROM',
 'RACE_TO',
 'RACE_DISTANCE',
 'RACE_NUMBER_OF_STAGES',
 'RACE_EDITION',
 'RACE_WEBSITE',
 'TEAM_ID',
 'TEAM_NAME',
 'TEAM_COUNTRY',
 'TEAM_MANAGERS',
 'RIDER_NUMBER',
 'RIDER_NAME',
 'RIDER_COUNTRY',
 'RIDER_INFO']

Understanding the query, line by line:

Line 1 : load CSV file

Line 2 : r:Race

RACE_ID (id)
RACE_NAME
RACE_FROM
RACE_TO
RACE_EDITION
RACE_DISTANCE
RACE_NUMBER_OF_STAGES
RACE_WEBSITE

Line 3 : t:Team

TEAM_ID (id)
TEAM_NAME
TEAM_COUNTRY
TEAM_MANAGERS

Line 4 : p:Rider

RIDER_NUMBER (number — stored on the rider's TAKES_PART_IN relationship in Line 5, not on the Rider node)
RIDER_NAME
RIDER_COUNTRY

Line 5 : Create the relationships and attach information to them (rider number and info on TAKES_PART_IN, race year on RIDES_FOR)

# Cypher import for the teams-and-riders CSV (one CSV row per rider).
# Lines 2-4 MERGE the Race, Team and Rider nodes so repeated rows reuse the same node.
# Line 5 MERGEs the Team -> Race TAKES_PART_IN link instead of CREATE-ing it: the same
# (team, race) pair appears on every rider row of that team, so CREATE would add one
# duplicate relationship per rider. The rider relationships are unique per row and
# carry per-row properties, so they are still CREATE-d.
query_rider = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0001-teams-and-riders.csv" AS csvLine
MERGE (r:Race { id: toInt(csvLine.RACE_ID), name: csvLine.RACE_NAME, from: csvLine.RACE_FROM, to: csvLine.RACE_TO, edition: csvLine.RACE_EDITION, distance: csvLine.RACE_DISTANCE, number_of_stages: csvLine.RACE_NUMBER_OF_STAGES, website: csvLine.RACE_WEBSITE })
MERGE (t:Team { id: toInt(csvLine.TEAM_ID), name: csvLine.TEAM_NAME, country: csvLine.TEAM_COUNTRY, sportingDirectors: csvLine.TEAM_MANAGERS }) 
MERGE (p:Rider { name: csvLine.RIDER_NAME, country: csvLine.RIDER_COUNTRY })
MERGE (t)-[:TAKES_PART_IN]->(r) CREATE (p)-[:TAKES_PART_IN { number: toInt(csvLine.RIDER_NUMBER), info: csvLine.RIDER_INFO }]->(r), (p)-[:RIDES_FOR { year: toInt(csvLine.RACE_YEAR) }]->(t);
"""

graph.run(query_rider)

<py2neo.database.Cursor at 0x28c3e77ef60>

from IPython.display import Image
Image(filename="pictures/Riders_links_info.gif")

<IPython.core.display.Image object>

#graph.delete_all()

B. 2nd example with Stages (Etapes)¶

import pandas as pd 

data_etapes = pd.read_csv('https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0002-stages.csv')
data_etapes.head(3)

list(data_etapes)

['RACE_ID',
 'STAGE_NUMBER',
 'STAGE_TYPE',
 'STAGE_DATE',
 'STAGE_START',
 'STAGE_START_COUNTRY',
 'STAGE_START_LATITUDE',
 'STAGE_START_LONGITUDE',
 'STAGE_FINISH',
 'STAGE_FINISH_COUNTRY',
 'STAGE_FINISH_LATITUDE',
 'STAGE_FINISH_LONGITUDE',
 'STAGE_DISTANCE',
 'STAGE_INFO']

# Cypher import for the stages CSV (one CSV row per stage).
# The first line also MATCHes the Race with id 1; if that Race does not exist yet
# the MATCH yields no rows and nothing is imported, so the riders import must run first.
# Each row MERGEs a Stage plus its start and finish City nodes, then CREATEs the
# IS_A_STAGE_OF, STARTS_FROM and FINISHED_AT relationships.
# CREATE is used for the relationships, so re-running the query duplicates them.
query_etapes = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0002-stages.csv" AS csvLine MATCH (r:Race { id: 1 })
MERGE (s:Stage { name: csvLine.STAGE_START + " / " + csvLine.STAGE_FINISH, number: toInt(csvLine.STAGE_NUMBER), type: csvLine.STAGE_TYPE, date: csvLine.STAGE_DATE, distance: toFloat(csvLine.STAGE_DISTANCE), info: csvLine.STAGE_INFO})
MERGE (cs: City { name: csvLine.STAGE_START, country: csvLine.STAGE_START_COUNTRY, lat: toFloat(csvLine.STAGE_START_LATITUDE), lon: toFloat(csvLine.STAGE_START_LONGITUDE) })
MERGE (cf: City { name: csvLine.STAGE_FINISH, country: csvLine.STAGE_FINISH_COUNTRY,  lat: toFloat(csvLine.STAGE_FINISH_LATITUDE), lon: toFloat(csvLine.STAGE_FINISH_LONGITUDE) })
CREATE (s)-[:IS_A_STAGE_OF]->(r), (s)-[:STARTS_FROM]->(cs), (s)-[:FINISHED_AT]->(cf);
"""

graph.run(query_etapes)

<py2neo.database.Cursor at 0x28c4890c080>

from IPython.display import Image
Image(filename="pictures/Etapes_links.gif")

<IPython.core.display.Image object>

C. 3rd example with Climbs¶

import pandas as pd 

data_climbs = pd.read_csv('https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0003-climbs.csv')
data_climbs.head(3)

list(data_climbs)

['STAGE_NUMBER',
 'STARTING_AT_KM',
 'NAME',
 'INITIAL_ALTITUDE',
 'DISTANCE',
 'AVERAGE_SLOPE',
 'CATEGORY']

# Cypher import for the climbs CSV (one CSV row per climb).
# Each row is matched to an existing Stage by its number, then a new Climb node is
# CREATE-d and attached to that Stage through an INCLUDES relationship.
# CREATE (not MERGE) is used, so running this query twice duplicates the Climb nodes.
query_climbs = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0003-climbs.csv" AS csvLine
MATCH (s:Stage { number: toInt(csvLine.STAGE_NUMBER) })
CREATE (s)-[:INCLUDES]->(c:Climb { startingAtKm: toFloat(csvLine.STARTING_AT_KM), name: csvLine.NAME, initialAltitude: toFloat(csvLine.INITIAL_ALTITUDE), averageSlope: toFloat(csvLine.AVERAGE_SLOPE), distance: toFloat(csvLine.DISTANCE), category: csvLine.CATEGORY });
"""

graph.run(query_climbs)

<py2neo.database.Cursor at 0x28c4890ca90>

Image(filename="pictures/climbs.PNG")

D. 4th example with Intermediate Sprints¶

import pandas as pd 

data_intermediate_sprints = pd.read_csv('https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0004-intermediate_sprints.csv')
data_intermediate_sprints.head(3)

# Cypher import for the intermediate-sprints CSV (one CSV row per sprint).
# Each row is matched to an existing Stage by its number, then an IntermediateSprint
# node is CREATE-d from the sprint columns (AT_KM, CITY, COUNTRY, LATITUDE, LONGITUDE)
# and attached to that Stage through an INCLUDES relationship.
# This previously re-loaded the climbs CSV and created a second copy of every Climb.
# NOTE(review): the IntermediateSprint label and property names are chosen here to
# mirror the City/Climb nodes of the other imports — confirm against the target schema.
query_intermediate_sprints = """
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/inserpio/tour-de-france-2014/master/tour-de-france-2014-0004-intermediate_sprints.csv" AS csvLine
MATCH (s:Stage { number: toInt(csvLine.STAGE_NUMBER) })
CREATE (s)-[:INCLUDES]->(i:IntermediateSprint { atKm: toFloat(csvLine.AT_KM), city: csvLine.CITY, country: csvLine.COUNTRY, lat: toFloat(csvLine.LATITUDE), lon: toFloat(csvLine.LONGITUDE) });
"""

graph.run(query_intermediate_sprints)

<py2neo.database.Cursor at 0x28c489fa978>

Image(filename="pictures/intermediate_sprints.PNG")

3. Queries in Cypher¶

https://github.com/mneedham/strava/blob/master/Strava.ipynb

How many teams per country will take part in TDF 2014?¶

# Cypher query: group Team nodes by country, returning for each country the list of
# team names and the number of teams, ordered from most to fewest teams.
# The result columns are t.country, collect(t.name) and teamsPerCountry.
questions_teams_per_country = """
MATCH (t:Team) RETURN DISTINCT t.country, collect(t.name), count(t.name) AS teamsPerCountry ORDER BY teamsPerCountry DESC;
"""

response_teams_per_country = graph.run(questions_teams_per_country)

for d in response_teams_per_country:
    print(d)

<Record t.country='FRA\xa0' collect(t.name)=['AG2R LA MONDIALE', 'FDJ.FR', 'TEAM EUROPCAR', 'COFIDIS, SOLUTIONS CREDITS', 'BRETAGNE - SECHE ENVIRONNEMENT'] teamsPerCountry=5>
<Record t.country='USA\xa0' collect(t.name)=['GARMIN - SHARP', 'BMC RACING TEAM', 'TREK FACTORY RACING'] teamsPerCountry=3>
<Record t.country='RUS\xa0' collect(t.name)=['TEAM KATUSHA', 'TINKOFF – SAXO'] teamsPerCountry=2>
<Record t.country='ITA\xa0' collect(t.name)=['CANNONDALE', 'LAMPRE - MERIDA'] teamsPerCountry=2>
<Record t.country='NED\xa0' collect(t.name)=['BELKIN PRO CYCLING', 'TEAM GIANT - SHIMANO'] teamsPerCountry=2>
<Record t.country='BEL\xa0' collect(t.name)=['OMEGA PHARMA - QUICK STEP', 'LOTTO – BELISOL'] teamsPerCountry=2>
<Record t.country='GBR\xa0' collect(t.name)=['TEAM SKY'] teamsPerCountry=1>
<Record t.country='ESP\xa0' collect(t.name)=['MOVISTAR TEAM'] teamsPerCountry=1>
<Record t.country='KAZ\xa0' collect(t.name)=['ASTANA PRO TEAM'] teamsPerCountry=1>
<Record t.country='AUS\xa0' collect(t.name)=['ORICA GREENEDGE'] teamsPerCountry=1>
<Record t.country='SUI\xa0' collect(t.name)=['IAM CYCLING'] teamsPerCountry=1>
<Record t.country='GER\xa0' collect(t.name)=['TEAM NETAPP – ENDURA'] teamsPerCountry=1>

graph.run(questions_teams_per_country).to_data_frame()

How many riders per country will take part in TDF 2014?¶

# Cypher query: group Rider nodes by country and count the riders in each,
# ordered from most to fewest riders.
# The result columns are r.country and ridersPerCountry.
questions_riders_per_country = """
MATCH (r:Rider) RETURN DISTINCT r.country, count(r.name) AS ridersPerCountry ORDER BY ridersPerCountry DESC;
"""

graph.run(questions_riders_per_country).to_data_frame().head()

NetworkX¶

Goal: Graph algorithms are used to compute metrics for graphs, nodes, or relationships. They provide insights on relevant:

- Entities in the graph (centralities, ranking)
- Inherent structures like communities (community-detection, graph-partitioning, clustering).

How does it work? : Many graph algorithms are iterative approaches that frequently traverse the graph for the computation using random walks, breadth-first or depth-first searches, or pattern matching.

Why is it hard?: Due to the exponential growth of possible paths with increasing distance, many of these approaches have high algorithmic complexity.

Fortunately, optimized algorithms exist that utilize certain structures of the graph, memorize already explored parts, and parallelize operations.

https://neo4j.com/docs/graph-algorithms/current/introduction/

1. Centrality algorithms determine the importance of distinct nodes in a network¶

PageRank
ArticleRank
Betweenness Centrality
Closeness Centrality
Harmonic Centrality
Eigenvector Centrality
Degree Centrality

2. Community detection algorithms evaluate how a group is clustered or partitioned, as well as its tendency to strengthen or break apart¶

Louvain
Label Propagation
Connected Components
Strongly Connected Components
Triangle Counting / Clustering Coefficient
Balanced Triads

3. Path finding algorithms help find the shortest path or evaluate the availability and quality of routes¶

Minimum Weight Spanning Tree
Shortest Path
Single Source Shortest Path
All Pairs Shortest Path
A*
Yen’s K-shortest paths
Random Walk

4. Link Prediction algorithms help determine the closeness of a pair of nodes. We would then use the computed scores as part of a link prediction solution¶

Adamic Adar
Common Neighbors
Preferential Attachment
Resource Allocation
Same Community
Total Neighbors

Symmetric vs. Asymmetric Graph¶

# Toy network described as an edge list: each tuple is (source, target).
my_network = [
    ("Adrien", "Charles"),
    ("Charles", "Louis"),
    ("Charles", "Yann"),
    ("Yann", "Adrien"),
    ("Louis", "Yann"),
]

# Same edges loaded twice: once ignoring direction, once keeping it.
G_symmetric = nx.Graph()
G_asymmetric = nx.DiGraph()

for source, target in my_network:
    # Undirected (symmetric) edge between the two people
    G_symmetric.add_edge(source, target)
    # Directed (asymmetric) edge pointing from source to target
    G_asymmetric.add_edge(source, target)
    
nx.draw_networkx(G_symmetric)

nx.draw_networkx(G_asymmetric)

c:\users\adsieg\desktop\nutella\neo4j_graph\.neo4j_venv\lib\site-packages\networkx\drawing\nx_pylab.py:676: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if cb.iterable(node_size):  # many node sizes

Weighted Graph¶

# Toy network as a weighted edge list: each tuple is (source, target, weight).
my_network = [
    ("Adrien", "Charles", 2),
    ("Charles", "Louis", 18),
    ("Charles", "Yann", 30),
    ("Yann", "Adrien", 5),
    ("Louis", "Yann", 11),
]

G_weighted = nx.Graph()

for source, target, weight in my_network:
    # Store the third tuple element as the edge's "weight" attribute
    G_weighted.add_edge(source, target, weight=weight)
    
nx.draw_networkx(G_weighted)

Use Case Facebook¶

# Location of the Facebook edge-list file (a relative path, so it is resolved
# against the directory the notebook is run from).
PATH_FACEBOOK = "dataset/facebook_combined.txt"
# Load the edge list into an undirected graph, converting node labels to int.
G_fb = nx.read_edgelist(PATH_FACEBOOK, create_using = nx.Graph(), nodetype=int)

# Print a short summary of the graph (node/edge counts, average degree).
# NOTE(review): nx.info is not available in recent NetworkX releases — confirm the installed version.
print(nx.info(G_fb))

Name: 
Type: Graph
Number of nodes: 4039
Number of edges: 88234
Average degree:  43.6910

nx.draw_networkx(G_fb)

	RACE_ID	RACE_NAME	RACE_YEAR	RACE_FROM	RACE_TO	RACE_DISTANCE	RACE_NUMBER_OF_STAGES	RACE_EDITION	RACE_WEBSITE	TEAM_ID	TEAM_NAME	TEAM_COUNTRY	TEAM_MANAGERS	RIDER_NUMBER	RIDER_NAME	RIDER_COUNTRY	RIDER_INFO
0	1	TOUR DE FRANCE	2014	05/07/2014	27/07/2014	3663.5	21	101	http://www.letour.com/le-tour/2014/us/	1	TEAM SKY	GBR	PORTAL Nicolas, KNAVEN Servais	1	FROOME Christopher	GBR	http://www.letour.com/le-tour/2014/us/riders/t...
1	1	TOUR DE FRANCE	2014	05/07/2014	27/07/2014	3663.5	21	101	http://www.letour.com/le-tour/2014/us/	1	TEAM SKY	GBR	PORTAL Nicolas, KNAVEN Servais	2	EISEL Bernhard	AUT	http://www.letour.com/le-tour/2014/us/riders/t...
2	1	TOUR DE FRANCE	2014	05/07/2014	27/07/2014	3663.5	21	101	http://www.letour.com/le-tour/2014/us/	1	TEAM SKY	GBR	PORTAL Nicolas, KNAVEN Servais	3	KIRYIENKA Vasili	BLR	http://www.letour.com/le-tour/2014/us/riders/t...

	RACE_ID	STAGE_NUMBER	STAGE_TYPE	STAGE_DATE	STAGE_START	STAGE_START_COUNTRY	STAGE_START_LATITUDE	STAGE_START_LONGITUDE	STAGE_FINISH	STAGE_FINISH_COUNTRY	STAGE_FINISH_LATITUDE	STAGE_FINISH_LONGITUDE	STAGE_DISTANCE	STAGE_INFO
0	1	1	Flat	05/07/2014	Leeds	ENG	53.799722	-1.549167	Harrogate	ENG	53.991000	-1.539000	190.5	http://www.letour.com/le-tour/2014/us/stage-1....
1	1	2	Hilly	06/07/2014	York	ENG	53.958333	-1.080278	Sheffield	ENG	53.383611	-1.466944	201.0	http://www.letour.com/le-tour/2014/us/stage-2....
2	1	3	Flat	07/07/2014	Cambridge	ENG	52.205000	0.119000	Londres	ENG	51.507222	-0.127500	155.0	http://www.letour.com/le-tour/2014/us/stage-3....

	STAGE_NUMBER	STARTING_AT_KM	NAME	DISTANCE	AVERAGE_SLOPE	CATEGORY
0	1	68.0	Côte de Cray	1.6	7.1	4
1	1	103.5	Côte de Buttertubs	4.5	6.8	3
2	1	129.5	Côte de Griton Moor	3.0	6.6	3

	STAGE_NUMBER	AT_KM	CITY	COUNTRY	LATITUDE	LONGITUDE
0	1	77.0	Newbiggin	ENG	54.26929	-2.00449
1	2	68.5	Keighley	ENG	53.86700	-1.91100
2	3	108.0	Epping Forest	ENG	51.66000	0.05000

	collect(t.name)	t.country	teamsPerCountry
0	[AG2R LA MONDIALE, FDJ.FR, TEAM EUROPCAR, COFI...	FRA	5
1	[GARMIN - SHARP, BMC RACING TEAM, TREK FACTORY...	USA	3
2	[TEAM KATUSHA, TINKOFF – SAXO]	RUS	2
3	[CANNONDALE, LAMPRE - MERIDA]	ITA	2
4	[BELKIN PRO CYCLING, TEAM GIANT - SHIMANO]	NED	2
5	[OMEGA PHARMA - QUICK STEP, LOTTO – BELISOL]	BEL	2
6	[TEAM SKY]	GBR	1
7	[MOVISTAR TEAM]	ESP	1
8	[ASTANA PRO TEAM]	KAZ	1
9	[ORICA GREENEDGE]	AUS	1
10	[IAM CYCLING]	SUI	1
11	[TEAM NETAPP – ENDURA]	GER	1