import pandas as pd
# read the input file
data = pd.read_csv('HN_posts_year_to_Sep_26_2016.csv')
# keep posts that received at least one comment, then draw a random sample of 20,000 rows
sample = data[data['num_comments'] > 0].sample(20000)
sample
 | id | title | url | num_points | num_comments | author | created_at |
---|---|---|---|---|---|---|---|
76795 | 11902973 | Why Team Happiness Can Be the Wrong Thing to A... | https://vimeo.com/143894732 | 91 | 44 | michaelfeathers | 6/14/2016 16:10 |
9300 | 12494791 | There's only one business worth starting | https://medium.com/hi-my-name-is-jon/theres-on... | 3 | 1 | hccampos | 9/14/2016 7:25 |
79128 | 11878535 | Video streaming for all | http://getlawd.com/ | 3 | 7 | stoufa88 | 6/10/2016 18:10 |
275008 | 10307145 | Google and Microsoft make patent peace | http://www.zdnet.com/article/google-and-micros... | 89 | 56 | tanglesome | 9/30/2015 21:02 |
132643 | 11423114 | What is your story of finding your cofounder, ... | NaN | 2 | 5 | PeterTMayer | 4/4/2016 16:24 |
... | ... | ... | ... | ... | ... | ... | ... |
174186 | 11082832 | The Godfather of Digital Maps | http://www.forbes.com/sites/miguelhelft/2016/0... | 30 | 8 | dll | 2/11/2016 20:13 |
117737 | 11546882 | MongoDB Twitter Spam Campaign | http://imgur.com/a/iY5C7 | 1 | 1 | snurk | 4/22/2016 3:29 |
245098 | 10523794 | The unmanned aerial drones of WW2 | https://en.wikipedia.org/wiki/Operation_Outward | 1 | 2 | fivedogit | 11/7/2015 4:58 |
230676 | 10633042 | How Cereal Is Made | http://luckypeach.com/how-cereal-is-made/ | 23 | 13 | zdw | 11/26/2015 14:14 |
60515 | 12045437 | Italian banking is the next shoe to drop | http://marginalrevolution.com/marginalrevoluti... | 40 | 6 | jseliger | 7/6/2016 19:56 |
20000 rows × 7 columns
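Note that sample() draws rows at random and no seed is set, so the exact rows above, and every count and average that follows, will vary from run to run. A minimal sketch of a reproducible draw, assuming an arbitrary seed of 42 and the hypothetical name sampleSeeded:
# hypothetical reproducible variant: pin the sampler with random_state
sampleSeeded = data[data['num_comments'] > 0].sample(20000, random_state=42)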
# keep only posts whose title contains 'ask hn' (case-insensitive)
sampleAsk = sample[sample['title'].str.lower().str.contains('ask hn', regex=False)]
sampleAsk
 | id | title | url | num_points | num_comments | author | created_at |
---|---|---|---|---|---|---|---|
188898 | 10969705 | Ask HN: Are GMail's new features making spam e... | NaN | 3 | 2 | aleem | 1/25/2016 20:27 |
66651 | 11992582 | Ask HN: Why is academic language so redundant? | NaN | 2 | 4 | 50CNT | 6/28/2016 10:14 |
184006 | 11006270 | Ask HN: What are some examples of great B2B la... | NaN | 5 | 3 | bossx | 1/31/2016 13:08 |
242932 | 10539626 | Ask HN: Accused of email hacking | NaN | 1 | 1 | dfraser992 | 11/10/2015 14:56 |
287609 | 10215239 | Ask HN: Is it common knowledge that Yahoo bene... | NaN | 1 | 1 | superplussed | 9/14/2015 14:34 |
... | ... | ... | ... | ... | ... | ... | ... |
40330 | 12219043 | Ask HN: Do you still play with VR actively? | NaN | 105 | 108 | billconan | 8/3/2016 16:14 |
271208 | 10335717 | Ask HN: Hackintosh vs Mac Mini | NaN | 2 | 6 | siquick | 10/5/2015 23:24 |
228610 | 10650345 | Ask HN: Learning path for React and Flux | NaN | 3 | 1 | rufus42 | 11/30/2015 16:56 |
120244 | 11526536 | Ask HN: Is this a Python bug? | NaN | 3 | 4 | wslh | 4/19/2016 12:47 |
149820 | 11276960 | Ask HN: Am I getting old? | NaN | 11 | 5 | greenspot | 3/13/2016 9:24 |
1693 rows × 7 columns
# keep only posts whose title contains 'show hn' (case-insensitive)
sampleShow = sample[sample['title'].str.lower().str.contains('show hn', regex=False)]
sampleShow
 | id | title | url | num_points | num_comments | author | created_at |
---|---|---|---|---|---|---|---|
288163 | 10211892 | Show HN: Best places to work remotely by actua... | https://workfrom.co | 17 | 9 | darrenbuckner | 9/13/2015 16:33 |
170868 | 11109744 | Show HN: The fastest way to discover fashion o... | http://frowse.fashion/home/3 | 1 | 2 | xShirase | 2/16/2016 13:50 |
139946 | 11360582 | SHOW HN: Left-Pad could be the next FizzBuzz s... | https://www.educative.io/collection/page/10370... | 4 | 1 | fahimulhaq | 3/25/2016 15:27 |
261028 | 10411041 | Show HN: Microphone Self-Announcing .NET Serv... | https://github.com/rogeralsing/Microphone | 54 | 6 | RogerAlsing | 10/19/2015 4:02 |
247565 | 10506163 | Show HN: Vivilio Discover books peers and inf... | https://www.vivilio.com | 9 | 4 | soumitrasg | 11/4/2015 13:05 |
... | ... | ... | ... | ... | ... | ... | ... |
143733 | 11327679 | Show HN: Micro a microservice toolkit | https://blog.micro.mu/2016/03/20/micro.html | 101 | 32 | chuhnk | 3/21/2016 12:52 |
179074 | 11043959 | Show HN: Swift and VR Google Cardboard Ported... | https://github.com/nzff/cardboard-swift | 64 | 19 | nzff | 2/5/2016 19:31 |
42857 | 12197474 | Show HN: Trading platform for Pokemon Go | https://medium.com/@deadlocked_d/pok%C3%A9mon-... | 1 | 1 | liongate2 | 7/31/2016 16:02 |
126956 | 11471060 | Show HN: Musicsaur Multi-room audio synchroni... | http://www.musicsaur.com/ | 3 | 2 | qrv3w | 4/11/2016 12:36 |
77493 | 11896000 | Show HN: Golf Tradr fantasy golf with a stock... | https://golftradr.com | 3 | 1 | rob_zim | 6/13/2016 18:21 |
1285 rows × 7 columns
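Both filters lower-case the title and match the tag anywhere in it, which is why the all-caps SHOW HN row above is caught as well. Since genuine posts conventionally open with the tag, a stricter sketch (not what was run here; maskAsk, maskShow and the Strict names are hypothetical) could anchor the match to the start of the title:
# hypothetical stricter filter: keep only titles that start with the tag
maskAsk = sample['title'].str.lower().str.startswith('ask hn')
maskShow = sample['title'].str.lower().str.startswith('show hn')
sampleAskStrict, sampleShowStrict = sample[maskAsk], sample[maskShow]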
# average comments per 'Ask HN' post (use the actual row count, not a hard-coded 1692)
averageComAsk = sampleAsk['num_comments'].mean()
averageComAsk
12.782043709391613
# average comments per 'Show HN' post
averageComShow = sampleShow['num_comments'].mean()
averageComShow
8.47704280155642
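On this sample, an Ask HN post draws roughly 12.8 comments on average against about 8.5 for a Show HN post. The same comparison can be made in a single pass by tagging each post and grouping; a sketch assuming the substring rule used above (tag and lower are hypothetical names):
import numpy as np
# tag each sampled post as 'ask hn', 'show hn' or 'other', then average by tag
lower = sample['title'].str.lower()
tag = np.select([lower.str.contains('ask hn', regex=False),
                 lower.str.contains('show hn', regex=False)],
                ['ask hn', 'show hn'], default='other')
sample.assign(tag=tag).groupby('tag')['num_comments'].mean()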
# parse the timestamps and extract the posting hour as a zero-padded string ('00'-'23')
hour = pd.to_datetime(sampleAsk['created_at'], format='%m/%d/%Y %H:%M').dt.strftime('%H')
# make a DataFrame pairing each post's hour with its num_comments
hourCom = pd.DataFrame({'hour': hour.reset_index(drop=True),
                        'num_comments': sampleAsk['num_comments'].reset_index(drop=True)})
hourCom
 | hour | num_comments |
---|---|---|
0 | 20 | 2 |
1 | 10 | 4 |
2 | 13 | 3 |
3 | 14 | 1 |
4 | 14 | 1 |
... | ... | ... |
1688 | 16 | 108 |
1689 | 23 | 6 |
1690 | 16 | 1 |
1691 | 12 | 4 |
1692 | 09 | 5 |
1693 rows × 2 columns
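Keeping the hour as a zero-padded string means the groupby below sorts '00' through '23' in the right order. If integer hours are preferred, an equivalent sketch (ts and hourComInt are hypothetical names):
# variant with integer hours 0-23 instead of '00'-'23' strings
ts = pd.to_datetime(sampleAsk['created_at'], format='%m/%d/%Y %H:%M')
hourComInt = pd.DataFrame({'hour': ts.dt.hour.values,
                           'num_comments': sampleAsk['num_comments'].values})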
# total comments per hour
sumCom = hourCom.groupby('hour').sum()
sumCom.sort_values(['num_comments'], ascending=False).head()
hour | num_comments |
---|---|
15 | 4091 |
17 | 1286 |
20 | 1273 |
13 | 1149 |
18 | 1108 |
# number of posts per hour
countCom = hourCom.groupby('hour').count()
countCom
hour | num_comments |
---|---|
00 | 57 |
01 | 52 |
02 | 48 |
03 | 65 |
04 | 47 |
05 | 39 |
06 | 47 |
07 | 51 |
08 | 47 |
09 | 33 |
10 | 64 |
11 | 63 |
12 | 67 |
13 | 69 |
14 | 91 |
15 | 115 |
16 | 104 |
17 | 93 |
18 | 123 |
19 | 87 |
20 | 100 |
21 | 103 |
22 | 69 |
23 | 59 |
# combine totals and counts, then derive the average comments per post for each hour
averageCom = pd.concat([sumCom, countCom], axis=1)
averageCom.columns = ['sum_comments', 'count_comments']
averageCom['average_comments'] = averageCom['sum_comments'] / averageCom['count_comments']
averageCom
hour | sum_comments | count_comments | average_comments |
---|---|---|---|
00 | 493 | 57 | 8.649123 |
01 | 485 | 52 | 9.326923 |
02 | 333 | 48 | 6.937500 |
03 | 523 | 65 | 8.046154 |
04 | 402 | 47 | 8.553191 |
05 | 916 | 39 | 23.487179 |
06 | 644 | 47 | 13.702128 |
07 | 693 | 51 | 13.588235 |
08 | 692 | 47 | 14.723404 |
09 | 146 | 33 | 4.424242 |
10 | 763 | 64 | 11.921875 |
11 | 523 | 63 | 8.301587 |
12 | 806 | 67 | 12.029851 |
13 | 1149 | 69 | 16.652174 |
14 | 972 | 91 | 10.681319 |
15 | 4091 | 115 | 35.573913 |
16 | 1049 | 104 | 10.086538 |
17 | 1286 | 93 | 13.827957 |
18 | 1108 | 123 | 9.008130 |
19 | 558 | 87 | 6.413793 |
20 | 1273 | 100 | 12.730000 |
21 | 1088 | 103 | 10.563107 |
22 | 1021 | 69 | 14.797101 |
23 | 626 | 59 | 10.610169 |
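The sum, count and concat steps above can also be collapsed into one named aggregation; a sketch that should reproduce the same three columns:
# one-pass equivalent of the sum_comments / count_comments / average_comments table
hourCom.groupby('hour')['num_comments'].agg(sum_comments='sum',
                                            count_comments='count',
                                            average_comments='mean')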
# the five hours with the highest average comments per post
averageCom.sort_values(by=['average_comments'], ascending=False).head()
hour | sum_comments | count_comments | average_comments |
---|---|---|---|
15 | 4091 | 115 | 35.573913 |
05 | 916 | 39 | 23.487179 |
13 | 1149 | 69 | 16.652174 |
22 | 1021 | 69 | 14.797101 |
08 | 692 | 47 | 14.723404 |