Data Engineer & DataOps
My LinkedIn
My GitHub
http://archive.ics.uci.edu/ml/datasets/Dresses_Attribute_Sales
import pandas as pd
import numpy as np
attr = pd.read_excel('Attribute DataSet.xlsx')
attr
Dress_ID | Style | Price | Rating | Size | Season | NeckLine | SleeveLength | waiseline | Material | FabricType | Decoration | Pattern Type | Recommendation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1006032852 | Sexy | Low | 4.6 | M | Summer | o-neck | sleevless | empire | NaN | chiffon | ruffles | animal | 1 |
1 | 1212192089 | Casual | Low | 0.0 | L | Summer | o-neck | Petal | natural | microfiber | NaN | ruffles | animal | 0 |
2 | 1190380701 | vintage | High | 0.0 | L | Automn | o-neck | full | natural | polyster | NaN | NaN | 0 | |
3 | 966005983 | Brief | Average | 4.6 | L | Spring | o-neck | full | natural | silk | chiffon | embroidary | 1 | |
4 | 876339541 | cute | Low | 4.5 | M | Summer | o-neck | butterfly | natural | chiffonfabric | chiffon | bow | dot | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | 713391965 | Casual | Low | 4.7 | M | Spring | o-neck | full | natural | polyster | NaN | NaN | solid | 1 |
496 | 722565148 | Sexy | Low | 4.3 | free | Summer | o-neck | full | empire | cotton | NaN | NaN | NaN | 0 |
497 | 532874347 | Casual | Average | 4.7 | M | Summer | v-neck | full | empire | cotton | NaN | lace | solid | 1 |
498 | 655464934 | Casual | Average | 4.6 | L | winter | boat-neck | sleevless | empire | silk | broadcloth | applique | 1 | |
499 | 919930954 | Casual | Low | 4.4 | free | Summer | v-neck | short | empire | cotton | Corduroy | lace | solid | 0 |
500 rows × 14 columns
sales = pd.read_excel('Dress Sales.xlsx')
sales
Dress_ID | 29/8/2013 | 31/8/2013 | 2013-02-09 00:00:00 | 2013-04-09 00:00:00 | 2013-06-09 00:00:00 | 2013-08-09 00:00:00 | 2013-10-09 00:00:00 | 2013-12-09 00:00:00 | 14/9/2013 | ... | 24/9/2013 | 26/9/2013 | 28/9/2013 | 30/9/2013 | 2013-02-10 00:00:00 | 2013-04-10 00:00:00 | 2013-06-10 00:00:00 | 2010-08-10 00:00:00 | 2013-10-10 00:00:00 | 2013-12-10 00:00:00 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1006032852 | 2114 | 2274 | 2491 | 2660 | 2727 | 2887 | 2930 | 3119 | 3204 | ... | 3554 | 3624.0 | 3706 | 3746.0 | 3795.0 | 3832.0 | 3897 | 3923.0 | 3985.0 | 4048 |
1 | 1212192089 | 151 | 275 | 570 | 750 | 813 | 1066 | 1164 | 1558 | 1756 | ... | 2710 | 2942.0 | 3258 | 3354.0 | 3475.0 | 3654.0 | 3911 | 4024.0 | 4125.0 | 4277 |
2 | 1190380701 | 6 | 7 | 7 | 7 | 8 | 8 | 9 | 10 | 10 | ... | 11 | 11.0 | 11 | 11.0 | 11.0 | 11.0 | 11 | 11.0 | 11.0 | 11 |
3 | 966005983 | 1005 | 1128 | 1326 | 1455 | 1507 | 1621 | 1637 | 1723 | 1746 | ... | 1878 | 1892.0 | 1914 | 1924.0 | 1929.0 | 1941.0 | 1952 | 1955.0 | 1959.0 | 1963 |
4 | 876339541 | 996 | 1175 | 1304 | 1396 | 1432 | 1559 | 1570 | 1638 | 1655 | ... | 2032 | 2156.0 | 2252 | 2312.0 | 2387.0 | 2459.0 | 2544 | 2614.0 | 2693.0 | 2736 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
495 | 713391965 | 0 | 0 | 0 | 560 | 554 | 544 | 537 | 525 | 519 | ... | 400 | 388.0 | 360 | 364.0 | 372.0 | 377.0 | 380 | 382.0 | 384.0 | 285 |
496 | 722565148 | 0 | 0 | 0 | 875 | 866 | 861 | 854 | 850 | 844 | ... | 859 | 866.0 | 882 | 888.0 | 895.0 | 898.0 | 906 | 913.0 | 919.0 | 931 |
497 | 532874347 | 0 | 0 | 0 | 734 | 728 | 726 | 715 | 694 | 690 | ... | 616 | 597.0 | 586 | 569.0 | 561.0 | 555.0 | 551 | 546.0 | 535.0 | 520 |
498 | 655464934 | 0 | 0 | 0 | 254 | 259 | 261 | 263 | 268 | 270 | ... | 257 | 256.0 | 255 | 254.0 | 253.0 | 250.0 | 249 | 249.0 | 249.0 | 248 |
499 | 919930954 | 0 | 0 | 0 | 538 | 545 | 558 | 563 | 578 | 585 | ... | 628 | 632.0 | 639 | 645.0 | 651.0 | 655.0 | 660 | 668.0 | 674.0 | 680 |
500 rows × 24 columns
# set Dress_ID as index
attr = attr.set_index('Dress_ID')
attr
Style | Price | Rating | Size | Season | NeckLine | SleeveLength | waiseline | Material | FabricType | Decoration | Pattern Type | Recommendation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Dress_ID | |||||||||||||
1006032852 | Sexy | Low | 4.6 | M | Summer | o-neck | sleevless | empire | NaN | chiffon | ruffles | animal | 1 |
1212192089 | Casual | Low | 0.0 | L | Summer | o-neck | Petal | natural | microfiber | NaN | ruffles | animal | 0 |
1190380701 | vintage | High | 0.0 | L | Automn | o-neck | full | natural | polyster | NaN | NaN | 0 | |
966005983 | Brief | Average | 4.6 | L | Spring | o-neck | full | natural | silk | chiffon | embroidary | 1 | |
876339541 | cute | Low | 4.5 | M | Summer | o-neck | butterfly | natural | chiffonfabric | chiffon | bow | dot | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
713391965 | Casual | Low | 4.7 | M | Spring | o-neck | full | natural | polyster | NaN | NaN | solid | 1 |
722565148 | Sexy | Low | 4.3 | free | Summer | o-neck | full | empire | cotton | NaN | NaN | NaN | 0 |
532874347 | Casual | Average | 4.7 | M | Summer | v-neck | full | empire | cotton | NaN | lace | solid | 1 |
655464934 | Casual | Average | 4.6 | L | winter | boat-neck | sleevless | empire | silk | broadcloth | applique | 1 | |
919930954 | Casual | Low | 4.4 | free | Summer | v-neck | short | empire | cotton | Corduroy | lace | solid | 0 |
500 rows × 13 columns
# clean typo faults for 'Season'
attr['Season'] = attr['Season'].str.replace('summer', 'Summer')
attr['Season'] = attr['Season'].str.replace('spring', 'Spring')
attr['Season'] = attr['Season'].str.replace('Automn', 'Autumn')
attr['Season'] = attr['Season'].str.replace('winter', 'Winter')
attr['Season'].value_counts()
Summer 160
Winter 145
Spring 124
Autumn 69
Name: Season, dtype: int64
# set Dress_ID as index
sales.set_index('Dress_ID', inplace=True)
# reformat headers as datetime
sales.columns = pd.to_datetime(sales.columns)
sales
2013-08-29 | 2013-08-31 | 2013-02-09 | 2013-04-09 | 2013-06-09 | 2013-08-09 | 2013-10-09 | 2013-12-09 | 2013-09-14 | 2013-09-16 | ... | 2013-09-24 | 2013-09-26 | 2013-09-28 | 2013-09-30 | 2013-02-10 | 2013-04-10 | 2013-06-10 | 2010-08-10 | 2013-10-10 | 2013-12-10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Dress_ID | |||||||||||||||||||||
1006032852 | 2114 | 2274 | 2491 | 2660 | 2727 | 2887 | 2930 | 3119 | 3204 | 3277 | ... | 3554 | 3624.0 | 3706 | 3746.0 | 3795.0 | 3832.0 | 3897 | 3923.0 | 3985.0 | 4048 |
1212192089 | 151 | 275 | 570 | 750 | 813 | 1066 | 1164 | 1558 | 1756 | 1878 | ... | 2710 | 2942.0 | 3258 | 3354.0 | 3475.0 | 3654.0 | 3911 | 4024.0 | 4125.0 | 4277 |
1190380701 | 6 | 7 | 7 | 7 | 8 | 8 | 9 | 10 | 10 | 10 | ... | 11 | 11.0 | 11 | 11.0 | 11.0 | 11.0 | 11 | 11.0 | 11.0 | 11 |
966005983 | 1005 | 1128 | 1326 | 1455 | 1507 | 1621 | 1637 | 1723 | 1746 | 1783 | ... | 1878 | 1892.0 | 1914 | 1924.0 | 1929.0 | 1941.0 | 1952 | 1955.0 | 1959.0 | 1963 |
876339541 | 996 | 1175 | 1304 | 1396 | 1432 | 1559 | 1570 | 1638 | 1655 | 1681 | ... | 2032 | 2156.0 | 2252 | 2312.0 | 2387.0 | 2459.0 | 2544 | 2614.0 | 2693.0 | 2736 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
713391965 | 0 | 0 | 0 | 560 | 554 | 544 | 537 | 525 | 519 | 511 | ... | 400 | 388.0 | 360 | 364.0 | 372.0 | 377.0 | 380 | 382.0 | 384.0 | 285 |
722565148 | 0 | 0 | 0 | 875 | 866 | 861 | 854 | 850 | 844 | 841 | ... | 859 | 866.0 | 882 | 888.0 | 895.0 | 898.0 | 906 | 913.0 | 919.0 | 931 |
532874347 | 0 | 0 | 0 | 734 | 728 | 726 | 715 | 694 | 690 | 686 | ... | 616 | 597.0 | 586 | 569.0 | 561.0 | 555.0 | 551 | 546.0 | 535.0 | 520 |
655464934 | 0 | 0 | 0 | 254 | 259 | 261 | 263 | 268 | 270 | 272 | ... | 257 | 256.0 | 255 | 254.0 | 253.0 | 250.0 | 249 | 249.0 | 249.0 | 248 |
919930954 | 0 | 0 | 0 | 538 | 545 | 558 | 563 | 578 | 585 | 590 | ... | 628 | 632.0 | 639 | 645.0 | 651.0 | 655.0 | 660 | 668.0 | 674.0 | 680 |
500 rows × 23 columns
# change data type to int
sales = sales.dropna().astype('int64')
sales
2013-08-29 | 2013-08-31 | 2013-02-09 | 2013-04-09 | 2013-06-09 | 2013-08-09 | 2013-10-09 | 2013-12-09 | 2013-09-14 | 2013-09-16 | ... | 2013-09-24 | 2013-09-26 | 2013-09-28 | 2013-09-30 | 2013-02-10 | 2013-04-10 | 2013-06-10 | 2010-08-10 | 2013-10-10 | 2013-12-10 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Dress_ID | |||||||||||||||||||||
1006032852 | 2114 | 2274 | 2491 | 2660 | 2727 | 2887 | 2930 | 3119 | 3204 | 3277 | ... | 3554 | 3624 | 3706 | 3746 | 3795 | 3832 | 3897 | 3923 | 3985 | 4048 |
1212192089 | 151 | 275 | 570 | 750 | 813 | 1066 | 1164 | 1558 | 1756 | 1878 | ... | 2710 | 2942 | 3258 | 3354 | 3475 | 3654 | 3911 | 4024 | 4125 | 4277 |
1190380701 | 6 | 7 | 7 | 7 | 8 | 8 | 9 | 10 | 10 | 10 | ... | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 | 11 |
966005983 | 1005 | 1128 | 1326 | 1455 | 1507 | 1621 | 1637 | 1723 | 1746 | 1783 | ... | 1878 | 1892 | 1914 | 1924 | 1929 | 1941 | 1952 | 1955 | 1959 | 1963 |
876339541 | 996 | 1175 | 1304 | 1396 | 1432 | 1559 | 1570 | 1638 | 1655 | 1681 | ... | 2032 | 2156 | 2252 | 2312 | 2387 | 2459 | 2544 | 2614 | 2693 | 2736 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
713391965 | 0 | 0 | 0 | 560 | 554 | 544 | 537 | 525 | 519 | 511 | ... | 400 | 388 | 360 | 364 | 372 | 377 | 380 | 382 | 384 | 285 |
722565148 | 0 | 0 | 0 | 875 | 866 | 861 | 854 | 850 | 844 | 841 | ... | 859 | 866 | 882 | 888 | 895 | 898 | 906 | 913 | 919 | 931 |
532874347 | 0 | 0 | 0 | 734 | 728 | 726 | 715 | 694 | 690 | 686 | ... | 616 | 597 | 586 | 569 | 561 | 555 | 551 | 546 | 535 | 520 |
655464934 | 0 | 0 | 0 | 254 | 259 | 261 | 263 | 268 | 270 | 272 | ... | 257 | 256 | 255 | 254 | 253 | 250 | 249 | 249 | 249 | 248 |
919930954 | 0 | 0 | 0 | 538 | 545 | 558 | 563 | 578 | 585 | 590 | ... | 628 | 632 | 639 | 645 | 651 | 655 | 660 | 668 | 674 | 680 |
225 rows × 23 columns
# aggregate sales data by month
sales_by_month = sales.T
sales_by_month.columns = sales.index
sales_by_month = sales_by_month.groupby(sales_by_month.index.month).sum().T
sales_by_month
2 | 4 | 6 | 8 | 9 | 10 | 12 | |
---|---|---|---|---|---|---|---|
Dress_ID | |||||||
1006032852 | 6286 | 6492 | 6624 | 11198 | 31297 | 6915 | 7167 |
1212192089 | 4045 | 4404 | 4724 | 5516 | 22443 | 5289 | 5835 |
1190380701 | 18 | 18 | 19 | 32 | 95 | 20 | 21 |
966005983 | 3255 | 3396 | 3459 | 5709 | 16590 | 3596 | 3686 |
876339541 | 3691 | 3855 | 3976 | 6344 | 17574 | 4263 | 4374 |
... | ... | ... | ... | ... | ... | ... | ... |
713391965 | 372 | 937 | 934 | 926 | 4015 | 921 | 810 |
722565148 | 895 | 1773 | 1772 | 1774 | 7740 | 1773 | 1781 |
532874347 | 561 | 1289 | 1279 | 1272 | 5741 | 1250 | 1214 |
655464934 | 253 | 504 | 508 | 510 | 2364 | 512 | 516 |
919930954 | 651 | 1193 | 1205 | 1226 | 5542 | 1237 | 1258 |
225 rows × 7 columns
sales_by_season = sales_by_month.merge(attr['Season'], on='Dress_ID').dropna().groupby('Season').sum()
sales_by_season
2 | 4 | 6 | 8 | 9 | 10 | 12 | |
---|---|---|---|---|---|---|---|
Season | |||||||
Autumn | 6560 | 9402 | 9434 | 11544 | 43575 | 9731 | 10022 |
Spring | 30356 | 34150 | 34475 | 57915 | 158064 | 35022 | 35332 |
Summer | 37839 | 45722 | 46917 | 66687 | 216637 | 49354 | 51224 |
Winter | 13344 | 15692 | 15815 | 24549 | 73591 | 16703 | 17315 |
# 9/2013 is the best-selling month
sales_by_month.max().sort_values(ascending=False)
9 61147
8 28089
2 13764
4 13516
6 13400
10 13102
12 12909
dtype: int64
# total sales by month
totalSales_by_month = sales_by_month.sum()
totalSales_by_month
2 85197
4 99748
6 101378
8 155130
9 467791
10 105451
12 108456
dtype: int64
output = []
for i in totalSales_by_month.index:
ratioSales = sales_by_month[i]/totalSales_by_month[totalSales_by_month.index == i].values[0]
output.append(pd.DataFrame(ratioSales.sort_values(ascending=False)[:5].index))
top10_items_by_month = pd.concat(output, axis=1)
top10_items_by_month.columns = totalSales_by_month.index
top10_items_by_month
2 | 4 | 6 | 8 | 9 | 10 | 12 | |
---|---|---|---|---|---|---|---|
0 | 629131530 | 629131530 | 629131530 | 629131530 | 629131530 | 629131530 | 629131530 |
1 | 749031896 | 749031896 | 1006032852 | 749031896 | 1006032852 | 1006032852 | 1006032852 |
2 | 1006032852 | 1006032852 | 749031896 | 1006032852 | 749031896 | 749031896 | 749031896 |
3 | 957723897 | 1212192089 | 1212192089 | 624314841 | 1212192089 | 1212192089 | 1212192089 |
4 | 624314841 | 957723897 | 957723897 | 957723897 | 957723897 | 957723897 | 957723897 |
top10_items = top10_items_by_month.melt()
top10_items.columns = ['Month', 'Dress_ID']
top10_items = pd.DataFrame(top10_items['Dress_ID'].unique())
top10_items.columns = ['Dress_ID']
top10_items.merge(attr['Recommendation'], how='inner', on='Dress_ID')
Dress_ID | Recommendation | |
---|---|---|
0 | 629131530 | 1 |
1 | 749031896 | 1 |
2 | 1006032852 | 1 |
3 | 957723897 | 1 |
4 | 624314841 | 1 |
5 | 1212192089 | 0 |
top10_items.merge(attr, how='inner', on='Dress_ID')
Dress_ID | Style | Price | Rating | Size | Season | NeckLine | SleeveLength | waiseline | Material | FabricType | Decoration | Pattern Type | Recommendation | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 629131530 | cute | Low | 4.7 | M | Spring | ruffled | short | empire | chiffonfabric | chiffon | bow | dot | 1 |
1 | 749031896 | vintage | Average | 4.8 | M | Summer | o-neck | short | empire | cotton | jersey | NaN | animal | 1 |
2 | 1006032852 | Sexy | Low | 4.6 | M | Summer | o-neck | sleevless | empire | NaN | chiffon | ruffles | animal | 1 |
3 | 957723897 | sexy | Low | 4.7 | M | Winter | o-neck | threequarter | NaN | NaN | chiffon | lace | 1 | |
4 | 624314841 | cute | Average | 4.7 | L | Spring | o-neck | short | NaN | cotton | NaN | sashes | solid | 1 |
5 | 1212192089 | Casual | Low | 0.0 | L | Summer | o-neck | Petal | natural | microfiber | NaN | ruffles | animal | 0 |