Question
{ cells: [ { cell_type: code, execution_count: 2, metadata: { collapsed: false, deletable: true, editable: true }, outputs: [], source: [ # Data file at
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Data file at https://www.cse.ust.hk/msbd5003/data ",
" ",
"lines = sc.textFile('../data/adj_noun_pairs.txt', 8)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"3162692"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines.count()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines.getNumPartitions()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"[u'early radical', u'french revolution', u'pejorative way', u'violent means', u'positive label']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lines.take(5)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"PythonRDD[7] at RDD at PythonRDD.scala:48"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Converting lines into word pairs. ",
"# Data is dirty: some lines have more than 2 words, so filter them out. ",
"pairs = lines.map(lambda l: tuple(l.split())).filter(lambda p: len(p)==2) ",
"pairs.cache()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'early', u'radical'), (u'french', u'revolution'), (u'pejorative', u'way'), (u'violent', u'means'), (u'positive', u'label')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pairs.take(5)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"N = pairs.count()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"3162674"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"N"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Compute the frequency of each pair. ",
"# Ignore pairs that not frequent enough ",
"pair_freqs = pairs.map(lambda p: (p,1)).reduceByKey(lambda f1, f2: f1 + f2) \\ ",
" .filter(lambda pf: pf[1] >= 100)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[((u'lead', u'role'), 298), ((u'other', u'means'), 202), ((u'huge', u'number'), 129), ((u'young', u'boy'), 156), ((u'old', u'age'), 174)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pair_freqs.take(5)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Computing the frequencies of the adjectives and the nouns ",
"a_freqs = pairs.map(lambda p: (p[0],1)).reduceByKey(lambda x,y: x+y) ",
"n_freqs = pairs.map(lambda p: (p[1],1)).reduceByKey(lambda x,y: x+y)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'algeria-related', 1), (u'funereal', 5), (u'datalink', 1), (u'then-leading', 1), (u'214th', 3)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"a_freqs.take(5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"106333"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n_freqs.count()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"1191"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Broadcasting the adjective and noun frequencies. ",
"#a_dict = a_freqs.collectAsMap() ",
"#a_dict = sc.parallelize(a_dict).map(lambda x: x) ",
"n_dict = sc.broadcast(n_freqs.collectAsMap()) ",
"a_dict = sc.broadcast(a_freqs.collectAsMap()) ",
"a_dict.value['violent']"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from math import * ",
" ",
"# Computing the PMI for a pair. ",
"def pmi_score(pair_freq): ",
" w1, w2 = pair_freq[0] ",
" f = pair_freq[1] ",
" pmi = log(float(f)*N/(a_dict.value[w1]*n_dict.value[w2]), 2) ",
" return pmi, (w1, w2)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Computing the PMI for all pairs. ",
"scored_pairs = pair_freqs.map(pmi_score)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [
{
"data": {
"text/plain": [
"[(14.41018838546462, (u'magna', u'carta')), (13.071365888694997, (u'polish-lithuanian', u'Commonwealth')), (12.990597616733414, (u'nitrous', u'oxide')), (12.64972604311254, (u'latter-day', u'Saints')), (12.50658937509916, (u'stainless', u'steel')), (12.482331020687814, (u'pave', u'runway')), (12.19140721768055, (u'corporal', u'punishment')), (12.183248694293388, (u'capital', u'punishment')), (12.147015483562537, (u'rush', u'yard')), (12.109945794428935, (u'globular', u'cluster'))]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Printing the most strongly associated pairs. ",
"scored_pairs.top(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Apache Toree - PySpark",
"language": "python",
"name": "apache_toree_pyspark"
},
"language_info": {
"file_extension": ".py",
"name": "python",
"pygments_lexer": "python",
"version": "2.7.14 "
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Modify the PMI example by sending a_dict and n_dict inside the closure. Do not use broadcast variables.
Step by Step Solution
There are 3 Steps involved in it
Step: 1
Get Instant Access to Expert-Tailored Solutions
See step-by-step solutions with expert insights and AI powered tools for academic success
Step: 2
Step: 3
Ace Your Homework with AI
Get the answers you need in no time with our AI-driven, step-by-step assistance
Get Started