Compare commits
1 Commits
0067273831
...
feature/po
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5ddd81ea05 |
Binary file not shown.
|
Before Width: | Height: | Size: 605 KiB After Width: | Height: | Size: 1.7 MiB |
File diff suppressed because one or more lines are too long
@@ -18,13 +18,14 @@ series:
|
|||||||
## The idea
|
## The idea
|
||||||
|
|
||||||
I want to test some of the pandas functionality, so I tried importing data from an HTML table to do some data analysis.
|
I want to test some of the pandas functionality, so I tried importing data from an HTML table to do some data analysis.
|
||||||
So I chose a web page with data in a table (or two, in this case).
|
So I chose a web page with data about manga in a table (or two, in this case).
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import matplotlib
|
import matplotlib
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
```
|
```
|
||||||
|
|
||||||
Here we have the basic imports for the packages needed by the project.
|
Here we have the basic imports for the packages needed by the project.
|
||||||
@@ -61,8 +62,8 @@ Starting with the scrape of the page with Pandas. In this case it returnes 2 tab
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
if len(tables) >= 2:
|
if len(tables) >= 2:
|
||||||
table_series = tables[0]
|
table_series = tables[1]
|
||||||
table_volumes = tables[1]
|
table_volumes = tables[0]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print("Error: The page does not contain enough tables.")
|
print("Error: The page does not contain enough tables.")
|
||||||
@@ -85,7 +86,7 @@ print(table_volumes.isnull().sum())
|
|||||||
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
|
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
|
||||||
Missing value stats for Series:
|
Missing value stats for Series:
|
||||||
Ranking 0
|
Ranking 0
|
||||||
Title 0
|
Volume 0
|
||||||
Sales 0
|
Sales 0
|
||||||
Year 0
|
Year 0
|
||||||
dtype: int64
|
dtype: int64
|
||||||
@@ -93,42 +94,24 @@ print(table_volumes.isnull().sum())
|
|||||||
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
|
-*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*-
|
||||||
Missing value stats for Volumes:
|
Missing value stats for Volumes:
|
||||||
Ranking 0
|
Ranking 0
|
||||||
Volume 0
|
Title 0
|
||||||
Sales 0
|
Sales 0
|
||||||
Year 0
|
Year 0
|
||||||
dtype: int64
|
dtype: int64
|
||||||
|
|
||||||
|
|
||||||
## Start the analysis
|
|
||||||
|
|
||||||
Now we know the data is consistent, so we need some general information about these two datasets.
|
Now we know the data is consistent, so we need some general information about these two datasets.
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
print("-*-")
|
print("-*-")
|
||||||
print(table_series.head())
|
print(table_series.head())
|
||||||
print(table_series.info(verbose=False))
|
|
||||||
print()
|
print()
|
||||||
print("-*-")
|
print("-*-")
|
||||||
print(table_volumes.head())
|
print(table_volumes.head())
|
||||||
print(table_volumes.info(verbose=False))
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
-*-
|
|
||||||
Ranking Title Sales Year
|
|
||||||
0 1 One Piece #50 1678208 2008
|
|
||||||
1 2 One Piece #51 1646978 2008
|
|
||||||
2 3 Nana #19 1645128 2008
|
|
||||||
3 4 One Piece #49 1544000 2008
|
|
||||||
4 5 Nana #20 1431335 2008
|
|
||||||
<class 'pandas.DataFrame'>
|
|
||||||
RangeIndex: 3413 entries, 0 to 3412
|
|
||||||
Columns: 4 entries, Ranking to Year
|
|
||||||
dtypes: int64(3), str(1)
|
|
||||||
memory usage: 106.8 KB
|
|
||||||
None
|
|
||||||
|
|
||||||
-*-
|
-*-
|
||||||
Ranking Volume Sales Year
|
Ranking Volume Sales Year
|
||||||
0 1 One Piece 5956540 2008
|
0 1 One Piece 5956540 2008
|
||||||
@@ -136,33 +119,128 @@ print(table_volumes.info(verbose=False))
|
|||||||
2 3 20th Century Boys 3710054 2008
|
2 3 20th Century Boys 3710054 2008
|
||||||
3 4 Hitman Reborn! 3371618 2008
|
3 4 Hitman Reborn! 3371618 2008
|
||||||
4 5 Bleach 3161825 2008
|
4 5 Bleach 3161825 2008
|
||||||
<class 'pandas.DataFrame'>
|
|
||||||
RangeIndex: 860 entries, 0 to 859
|
|
||||||
Columns: 4 entries, Ranking to Year
|
|
||||||
dtypes: int64(3), str(1)
|
|
||||||
memory usage: 27.0 KB
|
|
||||||
None
|
|
||||||
|
|
||||||
|
-*-
|
||||||
|
Ranking Title Sales Year
|
||||||
|
0 1 One Piece #50 1678208 2008
|
||||||
|
1 2 One Piece #51 1646978 2008
|
||||||
|
2 3 Nana #19 1645128 2008
|
||||||
|
3 4 One Piece #49 1544000 2008
|
||||||
|
4 5 Nana #20 1431335 2008
|
||||||
|
|
||||||
|
|
||||||
|
OK, now I need to reformat the data in table_volumes and check the output.
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
table_series[["Title","Sales","Year"]].plot(title="Manga Series")
|
table_volumes[['Volume', 'Volume_Number']] = table_volumes['Title'].str.split(' #', expand=True)
|
||||||
|
|
||||||
|
print()
|
||||||
|
print("-*-")
|
||||||
|
print(table_volumes.head())
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
-*-
|
||||||
|
Ranking Title Sales Year Volume Volume_Number
|
||||||
|
0 1 One Piece #50 1678208 2008 One Piece 50
|
||||||
|
1 2 One Piece #51 1646978 2008 One Piece 51
|
||||||
|
2 3 Nana #19 1645128 2008 Nana 19
|
||||||
|
3 4 One Piece #49 1544000 2008 One Piece 49
|
||||||
|
4 5 Nana #20 1431335 2008 Nana 20
|
||||||
|
|
||||||
|
|
||||||
|
## Start the analysis
|
||||||
|
|
||||||
|
We start with all the sales data per year.
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
df_pivot = table_series.pivot(index='Year', columns='Volume', values='Sales')
|
||||||
|
|
||||||
|
ax = df_pivot.plot()
|
||||||
|
plt.title('Serie Selling data for Year')
|
||||||
|
plt.show()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|

|
||||||
<Axes: title={'center': 'Manga Series'}>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
OK, we need to clean up some of the chaos in this plot.
|
||||||
|
|
||||||
|
* Remove the HUGE legend
|
||||||

|
* Use a plot type that is readable and useful (an area plot?)
|
||||||
|
* Fix the X-axis increment (I want full years, not halves)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
df_pivot = table_series.pivot(index='Year', columns='Volume', values='Sales')
|
||||||
|
|
||||||
|
# Remove the legend
|
||||||
|
ax = df_pivot.plot(kind='area', alpha=0.5, figsize=(10, 6), legend=False)
|
||||||
|
|
||||||
|
# Fix the year ticks
|
||||||
|
years = df_pivot.index.unique()
|
||||||
|
plt.xticks(np.arange(min(years), max(years) + 1, 1))
|
||||||
|
|
||||||
|
# Add some labels
|
||||||
|
plt.title('Serie Selling data for Year')
|
||||||
|
plt.xlabel('Year')
|
||||||
|
plt.ylabel('Sells')
|
||||||
|
|
||||||
|
plt.grid(True)
|
||||||
|
plt.show()
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
OK, now can we add some time references for Japan?
|
||||||
|
|
||||||
|
|
||||||
|
```python
|
||||||
|
df_pivot = table_series.pivot(index='Year', columns='Volume', values='Sales')
|
||||||
|
|
||||||
|
ax = df_pivot.plot(kind='area', alpha=0.5, figsize=(10, 6), legend=False)
|
||||||
|
years = df_pivot.index.unique()
|
||||||
|
plt.xticks(np.arange(min(years), max(years) + 1, 1))
|
||||||
|
|
||||||
|
plt.title('Serie Selling data for Year')
|
||||||
|
plt.xlabel('Year')
|
||||||
|
plt.ylabel('Sells')
|
||||||
|
plt.axvline(x=2014, color='red', linestyle='--', linewidth=2)
|
||||||
|
|
||||||
|
|
||||||
|
# Marker for Shonen Jump+
|
||||||
|
plt.axvline(x=2014, color='red', linestyle='--', linewidth=2)
|
||||||
|
plt.text(2014, ax.get_ylim()[1]*0.9, 'Shonen Jump+ launched', color='red', fontweight='bold')
|
||||||
|
|
||||||
|
# Marker for Demon Slayer: Kimetsu no Yaiba
|
||||||
|
plt.axvline(x=2019, color='red', linestyle='--', linewidth=2)
|
||||||
|
plt.text(2019, ax.get_ylim()[1]*0.7, 'Unprecedented success of Demon Slayer', color='blue', fontweight='bold')
|
||||||
|
|
||||||
|
# Marker for digital manga sales have surpassed physical manga source: https://hon.jp/news/1.0/0/30684
|
||||||
|
plt.axvline(x=2019, color='red', linestyle='--', linewidth=2)
|
||||||
|
plt.text(2019, ax.get_ylim()[1]*0.5, 'Digital sales surpassed physical', color='black', fontweight='bold')
|
||||||
|
|
||||||
|
# Gray area for the COVID-19 years with label
|
||||||
|
plt.axvspan(2020, 2022, color='gray', alpha=0.3)
|
||||||
|
plt.text(2021, ax.get_ylim()[1]*0.85, 'COVID-19', color='black', fontweight='bold', ha='center')
|
||||||
|
|
||||||
|
plt.grid(True)
|
||||||
|
plt.show()
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
OK, now we select a few of the manga to get a better view.
|
||||||
|
|||||||
Binary file not shown.
|
After Width: | Height: | Size: 394 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 306 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 314 KiB |
Reference in New Issue
Block a user