
# Software Development Trends in South Asia

In [110]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pycountry
from countryinfo import CountryInfo as CInfo # Se conecta con la informacion de cada país
import matplotlib.pyplot as plt
import  altair  as  alt # Graficos de promedios
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.palettes import Category20

### Number of Pushes per 100k by Country (2020Q1 - 2024Q1)

In [111]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

nan_rows_count = data.isna().any(axis=1).sum()
print(f"There are {nan_rows_count} rows with NaN values in the dataset.")
# Seleccionamos los 10 lenguajes mas usados en Data Science
top_program_lang = programming_languages = [
    "Python", "Java",
    "C++", "C","JavaScript", "C#",
    "Ruby", "Go", "PHP", "TypeScript"
]
data2 = data[data['language'].isin(top_program_lang)]
data2 = data2.reset_index(drop=True)

data2['year_quarter'] = data2['year'].astype(str) + '-Q' + data2['quarter'].astype(str)
data2
# Reseteamos el index
data2 = data2.reset_index(drop=True)
# Creating a unique identifier
data2['unique_id'] = data2['iso2_code'] + '-' + data2['language']
# Create a DataFrame of unique identifiers
iso2_code = pd.DataFrame({'iso2_code': data2['iso2_code'].unique()})

# Create a DataFrame of unique identifiers for languages
language = pd.DataFrame({'language': data2['language'].unique()})

# Create a DataFrame of all time periods
# time_periods = pd.DataFrame({'year_quarter': range(data2['year_quarter'].min(), data2['year_quarter'].max() + 1)})
year_quarter = pd.DataFrame({'year_quarter': data2['year_quarter'].unique()})

# Create the Cartesian product of unique_ids and time_periods
balanced_panel = iso2_code.merge(language, how='cross').merge(year_quarter, how='cross')

balanced_panel["unique_id"] = balanced_panel["iso2_code"] + "-" +balanced_panel["language"]
balanced_panel

# # Merge the balanced panel with the original data
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left')

# # Merge the DataFrames with suffixes
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left', suffixes=('', '_y'))

# # Now, drop the columns with '_y' suffix, which are from the right DataFrame
balanced_df = balanced_df.loc[:, ~balanced_df.columns.str.endswith('_y')]

There are 0 rows with NaN values in the dataset.


In [112]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/git_pushes.csv"
pushes = pd.read_csv(url, delimiter=',', keep_default_na=False)
pushes = pushes[pushes.iso2_code != "EU"]
git_update = pushes[pushes['iso2_code'].isin(['BD', 'LK', 'IN', "CN", 'TR', 'ZA', 'ID', 'US'])]
git_update = git_update.reset_index(drop=True)

In [113]:
countries = balanced_df.iso2_code.unique()

def create_populations_dictionary():
    country_populations = {}
    special_cases = {"MM": 54688774, "PS": 5483450, "ME": 602445, "AD":79824}
    # special_cases = {}
    for country in countries:
        try:
            country_populations.update({country: CInfo(country).info()["population"]})
        except KeyError:
            try:
                fallback_name = pycountry.countries.lookup(country).name
                country_populations.update({country: CInfo(fallback_name).info()["population"]})
            except KeyError:
                print(country)
                country_populations.update({country: special_cases[country]})

    return country_populations

country_populations = create_populations_dictionary()

ME
MM
PS
AD


In [114]:
# Create population
git_update["population"] = git_update["iso2_code"].map(country_populations)

# Create organizations_pc
git_update["git_pushes_pc"] = (git_update["git_pushes"] / git_update["population"])*100000

#create year-quarter
git_update['year_quarter'] = git_update['year'].astype(str) + '-Q' + git_update['quarter'].astype(str)

# Mapeo de códigos ISO a nombres de países
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkey',
    'ZA': 'South Africa',
    'ID': 'Indonesia'
}

# Crear una nueva columna con los nombres de los países
git_update['country'] = git_update['iso2_code'].map(country_map)

In [115]:
# Initialize Bokeh in the notebook
output_notebook()

def prepare_data(git_update):
    """Prepare the data by ensuring formats and filtering invalid values."""
    # Remove rows with null values in key columns
    git_update = git_update.dropna(subset=['year_quarter', 'git_pushes_pc', 'git_pushes'])

    # Filter positive values only
    git_update = git_update[
        (git_update['git_pushes_pc'] > 0) & (git_update['git_pushes'] > 0)
    ]

    return git_update

def create_bokeh_plots(git_update):
    """Create interactive Bokeh plots based on the provided data format."""
    def get_plot(value_col, title, ylabel):
        """Create an interactive plot for a specific metric."""
        x_quarters = sorted(git_update['year_quarter'].astype(str).unique().tolist())
        p = figure(
            title=title,
            x_range=x_quarters, 
            width=800,
            height=500,
            background_fill_color="#ffffff"  # White background for consistency
        )

        # Colors for the lines (adjusted to a maximum of 20 countries)
        countries = git_update['country'].unique()
        colors = Category20[20] if len(countries) <= 20 else [Category20[20][i % 20] for i in range(len(countries))]

        for i, country in enumerate(countries):
            country_data = git_update[git_update['country'] == country]
            if not country_data.empty:
                source = ColumnDataSource(country_data)
                p.line(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    line_width=3,  # Thicker lines
                    legend_label=country.capitalize(),
                    color=colors[i % len(colors)]
                )
                p.scatter(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    size=8,  # Larger points
                    color=colors[i % len(colors)],
                    legend_label=country.capitalize()
                )

        # Configure legend and tools
        p.legend.title = "Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"

        # Move legend outside of the plot area to the right
        p.add_layout(p.legend[0], 'right')

        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                (ylabel, f"@{value_col}{{0.00}}"),
            ],
            mode='vline'
        ))

        # Configure axes
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = ylabel
        p.y_range.start = 0  # Ensure the y-axis starts at 0
        p.xaxis.major_label_orientation = 0.8  # Rotate x-axis labels
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"

        return p

    # Create the plots
    fig_pc = get_plot(
        'git_pushes_pc',
        'Number of Git Pushes per 100k by Country (2020Q1-2024Q1)',
        'Pushes per 100k habitants'
    )

    fig_abs = get_plot(
        'git_pushes',
        'Number of Git Pushes by Country (2020Q1-2024Q1)',
        'Total Pushes'
    )

    # Create tabs for both plots
    tabs = Tabs(tabs=[
        TabPanel(child=fig_pc, title="Per Capita"),
        TabPanel(child=fig_abs, title="Absolute Values")
    ])

    return tabs

# Simulate loading the DataFrame (or use your real DataFrame)
# git_update = pd.read_csv('path_to_your_data.csv')  # Load your data here
# Ensure data is prepared before plotting
git_update = prepare_data(git_update)

# Create and show the plots
tabs = create_bokeh_plots(git_update)

show(tabs)



#### Analysis
This graph presents the number of Git pushes per 100,000 inhabitants by country (India, Bangladesh, and Sri Lanka) from Q1 2020 to Q1 2024. Key observations include:

- **Sri Lanka** exhibits the highest activity in terms of Git pushes per capita, with a significant increase, reaching over 2,500 pushes per 100k inhabitants by Q1 2024. This shows a highly active developer community that consistently contributes to repositories.
- **India** maintains steady growth, with Git pushes increasing from around 500 per 100k in 2020 to over 1,000 per 100k by 2024. This reflects India’s growing base of developers and continuous contributions to code repositories.
- **Bangladesh**, while starting from a lower base, shows a gradual increase in pushes, reaching around 600 per 100k by Q1 2024. Although the growth is slower compared to the other two countries, it indicates a steady expansion in developer activity.

#### Key Observations:
- **Sri Lanka** clearly leads in terms of Git pushes per capita, which indicates that its developers are highly engaged in collaborative and open-source projects.
- **India** shows a consistent upward trend, which aligns with the country’s broader role in the global software development industry, reflecting growing engagement from its developer population.
- **Bangladesh** has a slower growth rate but shows steady progress, indicating that its tech ecosystem is gradually becoming more active.

#### Comparative Insights:
- **Sri Lanka**’s rapid rise in Git pushes per capita suggests a strong emphasis on open-source development and collaborative projects, making it a regional leader in software contributions.
- **India**’s growth, though moderate, reflects the massive scale of its developer base and consistent contributions to both local and global projects.
- **Bangladesh** shows slower growth but represents an emerging tech scene, where increasing contributions suggest a positive trajectory for the country’s software development activities.

# Repos

### Number of Repositories per 100k by Country (2020Q1 - 2024Q1)

In [116]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/repositories.csv"
repos = pd.read_csv(url, delimiter=',', keep_default_na=False)
repos_update = repos[repos['iso2_code'].isin(['BD', 'LK', 'IN', 'CN', 'TR', 'ZA', 'ID', 'US'])]
repos_update = repos_update.reset_index(drop=True)
repos_update.head()

Unnamed: 0,repositories,iso2_code,year,quarter
0,30669781,US,2020,1
1,20179087,CN,2020,1
2,9254535,IN,2020,1
3,1645127,ID,2020,1
4,772897,TR,2020,1


In [117]:
# Create population
repos_update["population"] = repos_update["iso2_code"].map(country_populations)

# Create pushes_pc
repos_update["repositories_pc"] = (repos_update["repositories"] / repos_update["population"])*100000

#create year-quarter
repos_update['year_quarter'] = repos_update['year'].astype(str) + '-Q' + repos_update['quarter'].astype(str)


# Mapeo de códigos ISO a nombres de países
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkiye',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Crear una nueva columna con los nombres de los países
repos_update['country'] = repos_update['iso2_code'].map(country_map)

In [118]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.palettes import Category20

# Initialize Bokeh in the notebook
output_notebook()

def create_bokeh_plots(repos_update):
    """Create interactive Bokeh plots based on the provided data."""

    def get_plot(value_col, title, ylabel):
        """Create an interactive plot for a specific metric."""
        p = figure(
            title=title,
            x_range=sorted(repos_update['year_quarter'].unique()),  # Use categorical labels for the x-axis
            width=800,
            height=500,
            background_fill_color="#ffffff"  # White background for consistency
        )

        # Colors for lines (max of 20 countries)
        countries = repos_update['country'].unique()
        colors = Category20[20] if len(countries) <= 20 else [Category20[20][i % 20] for i in range(len(countries))]

        for i, country in enumerate(countries):
            country_data = repos_update[repos_update['country'] == country]
            if not country_data.empty:
                source = ColumnDataSource(country_data)
                p.line(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    line_width=3,  # Thicker lines
                    legend_label=country.capitalize(),
                    color=colors[i % len(colors)]
                )
                p.scatter(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    size=8,  # Larger points
                    color=colors[i % len(colors)],
                    legend_label=country.capitalize()
                )

        # Configure legend and tools
        p.legend.title = "Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"

        # Move legend outside of the plot area to the right
        p.add_layout(p.legend[0], 'right')

        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                (ylabel, f"@{value_col}{{0.00}}"),
            ],
            mode='vline'
        ))

        # Configure axes
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = ylabel
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8  # Rotate x-axis labels
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"

        return p

    # Create the plots
    fig_pc = get_plot(
        'repositories_pc',
        'Number of Repositories per 100k by Country (2020Q1-2024Q1)',
        'Repositories per 100k habitants'
    )

    fig_abs = get_plot(
        'repositories',
        'Number of Repositories by Country (2020Q1-2024Q1)',
        'Total Repositories'
    )

    # Create tabs with both plots
    tabs = Tabs(tabs=[
        TabPanel(child=fig_pc, title="Per Capita"),
        TabPanel(child=fig_abs, title="Absolute Values")
    ])

    return tabs

# Example of how to load your data
# repos_update = pd.read_csv('path_to_your_data.csv')  # Load your real dataset here

# Create and show the plots
tabs = create_bokeh_plots(repos_update)
show(tabs)


#### Analysis
This graph shows the number of repositories per 100,000 inhabitants by country (India, Bangladesh, and Sri Lanka) from Q1 2020 to Q1 2024. The following key insights can be drawn from the data:

- **Sri Lanka** leads in the number of repositories per 100k inhabitants, consistently staying ahead of both India and Bangladesh. By Q1 2024, Sri Lanka has exceeded 4,000 repositories per 100k inhabitants.
- **India** shows steady growth, following closely behind Sri Lanka. India's repository creation per capita has surpassed 2,500 by Q1 2024, indicating the country's solid and growing developer base.
- **Bangladesh**, while demonstrating growth, lags behind both Sri Lanka and India. By Q1 2024, Bangladesh reaches just over 1,000 repositories per 100k inhabitants.

#### Key Observations:
- **Sri Lanka**'s higher number of repositories per 100k inhabitants suggests a highly active and engaged developer community relative to its population size. This could be due to a thriving tech startup ecosystem and active government support for digital initiatives.
- **India** shows substantial growth in repository creation, reflecting the country's growing importance in the global software industry, though its per capita repository count remains lower than that of Sri Lanka.
- **Bangladesh**'s lower repository creation rate indicates that while the tech industry is growing, it is still developing compared to its regional counterparts.

#### Comparative Insights:
- The data highlights **Sri Lanka** as an outlier in terms of repository creation per capita. Despite being smaller in population, Sri Lanka outpaces India and Bangladesh significantly.
- **India**, with its much larger population, has a lower number of repositories per capita but still shows impressive growth, reflecting the scale of its developer ecosystem.
- **Bangladesh**, although trailing in repository creation, is showing consistent progress, suggesting that the country's software development sector is steadily maturing.

# Number of Developers per 100k by Country (2020Q1 - 2024Q1)

In [119]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/developers.csv"
dev = pd.read_csv(url, delimiter=',', keep_default_na=False)
dev = dev[dev.iso2_code != "EU"]
dev_update = dev[dev['iso2_code'].isin(['BD', 'LK', 'IN', 'CN', 'TR', 'ZA', 'ID', 'US'])]
dev_update = dev_update.reset_index(drop=True)
# dev_update.head()

In [120]:
# Create population
dev_update["population"] = dev_update["iso2_code"].map(country_populations)

# Create pushes_pc
dev_update["developers_pc"] = (dev_update["developers"] / dev_update["population"])*100000

#create year-quarter
dev_update['year_quarter'] = dev_update['year'].astype(str) + '-Q' + dev_update['quarter'].astype(str)

# Mapeo de códigos ISO a nombres de países
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkiye',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Crear una nueva columna con los nombres de los países
dev_update['country'] = dev_update['iso2_code'].map(country_map)

# dev_update.head()

In [121]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.palettes import Category20

# Initialize Bokeh in the notebook
output_notebook()

def create_bokeh_plots(dev_update):
    """Create interactive Bokeh plots based on the provided data."""

    def get_plot(value_col, title, ylabel):
        """Create an interactive plot for a specific metric."""
        p = figure(
            title=title,
            x_range=sorted(dev_update['year_quarter'].unique()),  # Use categorical labels for the x-axis
            width=800,
            height=500,
            background_fill_color="#ffffff"  # White background for consistency
        )

        # Colors for lines (max of 20 countries)
        countries = dev_update['country'].unique()
        colors = Category20[20] if len(countries) <= 20 else [Category20[20][i % 20] for i in range(len(countries))]

        for i, country in enumerate(countries):
            country_data = dev_update[dev_update['country'] == country]
            if not country_data.empty:
                source = ColumnDataSource(country_data)
                p.line(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    line_width=3,  # Thicker lines
                    legend_label=country.capitalize(),
                    color=colors[i % len(colors)]
                )
                p.scatter(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    size=8,  # Larger points
                    color=colors[i % len(colors)],
                    legend_label=country.capitalize()
                )

        # Configure legend and tools
        p.legend.title = "Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"

        # Move legend outside of the plot area to the right
        p.add_layout(p.legend[0], 'right')

        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                (ylabel, f"@{value_col}{{0.00}}"),
            ],
            mode='vline'
        ))

        # Configure axes
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = ylabel
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8  # Rotate x-axis labels
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"

        return p

    # Create the plots
    fig_pc = get_plot(
        'developers_pc',
        'Number of Developers per 100k by Country (2020Q1-2024Q1)',
        'Developers per 100k habitants'
    )

    fig_abs = get_plot(
        'developers',
        'Number of Developers by Country (2020Q1-2024Q1)',
        'Total Developers'
    )

    # Create tabs with both plots
    tabs = Tabs(tabs=[
        TabPanel(child=fig_pc, title="Per Capita"),
        TabPanel(child=fig_abs, title="Absolute Values")
    ])

    return tabs

# Execute the function and show the plots
tabs = create_bokeh_plots(dev_update)
show(tabs)


#### Analysis
This graph displays the number of developers per 100,000 inhabitants by country (India, Bangladesh, and Sri Lanka) from Q1 2020 to Q1 2024. The key observations from the data include:

- **Sri Lanka** consistently has the highest number of developers per 100k inhabitants, exceeding 1,500 developers by Q1 2024. This suggests a highly active and engaged developer community relative to the country’s population.
- **India** shows steady and significant growth, reaching close to 1,000 developers per 100k inhabitants by Q1 2024. This reflects India's large and rapidly growing developer base, even though the per capita numbers remain behind Sri Lanka.
- **Bangladesh**, while showing slower growth than the other two countries, has increased its number of developers per capita from just over 200 per 100k inhabitants in 2020 to over 500 by 2024. This indicates a steady expansion of the tech industry and developer population in the country.

#### Key Observations:
- **Sri Lanka**'s high number of developers per capita emphasizes the country’s commitment to growing its software development sector and fostering technical skills.
- **India**, while showing lower numbers per capita compared to Sri Lanka, has a massive and rapidly expanding developer population, which continues to grow at a steady pace.
- **Bangladesh** shows promising growth, though it still lags behind both Sri Lanka and India in terms of the number of developers per 100k inhabitants. This could be attributed to its developing tech ecosystem.

#### Comparative Insights:
- **Sri Lanka**’s lead in developers per capita shows a robust focus on technology education and opportunities, despite its smaller population.
- **India**’s steady rise highlights the country’s growing global influence in the tech industry, supported by a massive and diverse developer base.
- **Bangladesh**'s growth, while slower, indicates the country is catching up, albeit at a different pace, likely due to its different economic and infrastructural context.

# Number of Organizations per 100k by Country (2020Q1 - 2024Q1)

In [122]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/organizations.csv"
org = pd.read_csv(url, delimiter=',', keep_default_na=False)
org = org[org.iso2_code != "EU"]

org_update = org[org['iso2_code'].isin(['BD', 'LK', 'IN', 'CN', 'TR', 'ZA', 'ID', 'US'])]
org_update = org_update.reset_index(drop=True)
# org_update.head()

In [123]:
# Create population
org_update["population"] = org_update["iso2_code"].map(country_populations)

# Create organizations_pc
org_update["organizations_pc"] = (org_update["organizations"] / org_update["population"])*100000

#create year-quarter
org_update['year_quarter'] = org_update['year'].astype(str) + '-Q' + org_update['quarter'].astype(str)

# Mapeo de códigos ISO a nombres de países
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkiye',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Crear una nueva columna con los nombres de los países
org_update['country'] = org_update['iso2_code'].map(country_map)

# org_update.head()

In [124]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.palettes import Category20

# Initialize Bokeh in the notebook
output_notebook()

def create_bokeh_plots(org_update):
    """Create interactive Bokeh plots based on the provided data."""

    def get_plot(value_col, title, ylabel):
        """Create an interactive plot for a specific metric."""
        p = figure(
            title=title,
            x_range=sorted(org_update['year_quarter'].unique()),  # Use categorical labels for the x-axis
            width=800,
            height=500,
            background_fill_color="#ffffff"  # White background for consistency
        )

        # Colors for lines (max of 20 countries)
        countries = org_update['country'].unique()
        colors = Category20[20] if len(countries) <= 20 else [Category20[20][i % 20] for i in range(len(countries))]

        for i, country in enumerate(countries):
            country_data = org_update[org_update['country'] == country]
            if not country_data.empty:
                source = ColumnDataSource(country_data)
                p.line(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    line_width=3,  # Thicker lines
                    legend_label=country.capitalize(),
                    color=colors[i % len(colors)]
                )
                p.scatter(
                    x='year_quarter',
                    y=value_col,
                    source=source,
                    size=8,  # Larger points
                    color=colors[i % len(colors)],
                    legend_label=country.capitalize()
                )

        # Configure legend and tools
        p.legend.title = "Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"

        # Move legend outside of the plot area to the right
        p.add_layout(p.legend[0], 'right')

        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                (ylabel, f"@{value_col}{{0.00}}"),
            ],
            mode='vline'
        ))

        # Configure axes
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = ylabel
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8  # Rotate x-axis labels
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"

        return p

    # Create the plots
    fig_pc = get_plot(
        'organizations_pc',
        'Number of Organizations per 100k by Country (2020Q1-2024Q1)',
        'Organizations per 100k habitants'
    )

    fig_abs = get_plot(
        'organizations',
        'Number of Organizations by Country (2020Q1-2024Q1)',
        'Total Organizations'
    )

    # Create tabs with both plots
    tabs = Tabs(tabs=[
        TabPanel(child=fig_pc, title="Per Capita"),
        TabPanel(child=fig_abs, title="Absolute Values")
    ])

    return tabs

# Execute the function and show the plots
tabs = create_bokeh_plots(org_update)
show(tabs)


#### Analysis
This graph shows the number of organizations per 100,000 inhabitants by country (India, Bangladesh, and Sri Lanka) from Q1 2020 to Q1 2024. The key insights from the data include:

- **Sri Lanka** shows the highest number of organizations per 100k inhabitants, with a steep growth from 30 in Q1 2020 to over 90 by Q1 2024. This indicates a highly active organizational base relative to its population size, likely reflecting a thriving startup ecosystem.
- **India** shows steady growth, increasing from around 20 organizations per 100k in Q1 2020 to over 50 by Q1 2024. This growth reflects India's large and expanding tech industry, with more organizations being established over time.
- **Bangladesh** remains behind both Sri Lanka and India, but still shows growth, increasing from around 20 organizations per 100k to approximately 30 by Q1 2024. This suggests a more gradual growth of organizations in the tech sector compared to its regional counterparts.

#### Key Observations:
- **Sri Lanka**’s rapid growth in organizations per capita suggests a strong support system for new tech businesses and an entrepreneurial culture that encourages the creation of new organizations.
- **India**’s steady growth is consistent with its position as a global tech hub, with more organizations being formed as its tech ecosystem expands, even if the per capita numbers remain lower than Sri Lanka.
- **Bangladesh** shows slower but steady growth, reflecting the development of its tech sector. While it lags behind, the increase in the number of organizations is promising for the future of tech innovation in the country.

#### Comparative Insights:
- **Sri Lanka** continues to lead in terms of organizations per capita, suggesting a strong focus on fostering new businesses in the tech sector.
- **India**’s numbers, though lower per capita than Sri Lanka, reflect its vast organizational landscape and continuous growth in tech-related industries.
- **Bangladesh**'s slower pace of growth compared to the other two countries may be due to infrastructural or economic challenges, but the upward trend indicates ongoing development in its tech sector.

## Licenses

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

This code loads a dataset of software licenses, filters it for Bangladesh, and creates a bar plot to visualize the number of "pushers" (contributors) for each license type in the country.

In [126]:
licenses = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/licenses.csv')

The chart shows a clear distribution of contributors across various software licenses in Bangladesh. Some licenses, such as MIT and GPL-3.0, have a significantly higher number of contributors, while others, like Proprietary and Apache-2.0, have fewer contributors. This suggests that certain open-source licenses, such as MIT, attract more developer participation compared to others. The bars are ordered to highlight this variation, providing insight into the preferred licenses within the country.

Here we've selected five groups depending to the licenses.


*   MIT License
*   Apache license
*   GPL 3.0
*   NOASSERTION
*   Others licenses

The chart shows the number of contributors for each software license in Bangladesh, with several licenses collapsed into the "Other Licenses" category. The "Other Licenses" category now represents a significant portion of the contributions, while licenses like MIT and GPL-3.0 still have the most contributors.

In [127]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.layouts import column

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkey',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'   
}

# Load the dataset
licenses = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/licenses.csv')

def create_country_plot(country_code, country_name):
    """Create a plot for a specific country."""
    # Filter data for the country
    country_data = licenses[licenses.iso2_code == country_code]
    
    # Get top 5 licenses by number of pushers
    top_licenses = (
        country_data.groupby('spdx_license')['num_pushers']
        .sum()
        .nlargest(5)
        .reset_index()
    )
    
    # Create ColumnDataSource
    source = ColumnDataSource(top_licenses)
    
    # Create figure
    p = figure(
        x_range=source.data['spdx_license'],
        height=600,
        width=800,
        title=f"Number of Pushers by License in {country_name}",
        toolbar_location="above",
        tools="pan,box_zoom,reset,save"
    )
    
    # Add bars
    p.vbar(
        x='spdx_license',
        top='num_pushers',
        width=0.6,
        source=source,
        color="#4A90E2",
        fill_alpha=0.8
    )
    
    # Add text labels
    p.text(
        x='spdx_license',
        y='num_pushers',
        text='num_pushers',
        source=source,
        text_align='center',
        text_baseline='bottom',
        text_font_size="10pt",
        text_color="#333333"
    )
    
    # Add hover tool
    p.add_tools(HoverTool(tooltips=[
        ("License", "@spdx_license"),
        ("Pushers", "@num_pushers")
    ]))
    
    # Customize appearance
    p.background_fill_color = "#f5f5f5"
    p.grid.grid_line_color = "#e0e0e0"
    p.grid.grid_line_dash = "dotted"
    p.title.text_color = "#333333"
    p.title.text_font_size = "16pt"
    p.xaxis.axis_label = "License"
    p.xaxis.axis_label_text_color = "#333333"
    p.xaxis.major_label_text_color = "#555555"
    p.yaxis.axis_label = "Number of Pushers"
    p.yaxis.axis_label_text_color = "#333333"
    p.yaxis.major_label_text_color = "#555555"
    p.xaxis.major_label_orientation = 0.8
    p.y_range.start = 0
    
    return p

def create_tabs():
    """Create tabs for all countries."""
    tabs = []
    
    # Create a tab for each country
    for code, name in country_map.items():
        plot = create_country_plot(code, name)
        tab = TabPanel(child=plot, title=name)
        tabs.append(tab)
    
    # Combine all tabs
    return Tabs(tabs=tabs)

# Create and show the visualization
tabs = create_tabs()
show(tabs)

In [128]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div
from bokeh.palettes import Category20

# Initialize Bokeh in the notebook
output_notebook()

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkey',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Load the dataset
licenses = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/licenses.csv')

# Filter only the countries included in the map
licenses = licenses[licenses['iso2_code'].isin(country_map.keys())]

def create_bokeh_plots(data):
    """Create interactive plots with Bokeh for each country in tabs."""
    tabs = []

    for iso2_code, country_name in country_map.items():
        # Filter data by country
        country_data = data[data['iso2_code'] == iso2_code]

        if country_data.empty:
            print(f"No data available for {country_name}.")
            continue

        # Create an explicit copy to avoid SettingWithCopyWarning
        country_data = country_data.copy()

        # Identify the top 5 licenses by number of pushers
        top_licenses = country_data.groupby('spdx_license')['num_pushers'].sum().nlargest(5).index.tolist()

        # Calculate "Other Licenses"
        country_data['collapsed_license'] = country_data['spdx_license'].apply(
            lambda x: x if x in top_licenses else 'Other Licenses'
        )

        # Group by quarter and license
        grouped_data = country_data.groupby(['year', 'quarter', 'collapsed_license'])['num_pushers'].sum().reset_index()
        grouped_data['year_quarter'] = grouped_data['year'].astype(str) + "-Q" + grouped_data['quarter'].astype(str)

        # Create data source
        source = ColumnDataSource(grouped_data)

        # Configure plot
        p = figure(
            title=f"Licenses in {country_name}",
            x_range=sorted(grouped_data['year_quarter'].unique()),
            width=800,
            height=500,
            background_fill_color="#ffffff"  # White background for consistency
        )

        # Dynamic colors for each license
        licenses_list = grouped_data['collapsed_license'].unique()
        colors = Category20[len(licenses_list)] if len(licenses_list) <= 20 else Category20[20]

        for i, license_type in enumerate(licenses_list):
            license_data = grouped_data[grouped_data['collapsed_license'] == license_type]
            license_source = ColumnDataSource(license_data)

            p.line(
                x='year_quarter',
                y='num_pushers',
                source=license_source,
                line_width=3,
                legend_label=license_type,
                color=colors[i % len(colors)]
            )
            p.scatter(
                x='year_quarter',
                y='num_pushers',
                source=license_source,
                size=8,
                color=colors[i % len(colors)],
                legend_label=license_type
            )

        # Configure legend
        p.legend.title = "Licenses"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"

        # Move legend outside of the plot area to the right
        p.add_layout(p.legend[0], 'right')

        # Configure tools and axes
        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                ("Pushers", "@num_pushers"),
                ("License", "@collapsed_license"),
            ],
            mode='vline'
        ))

        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = 'Number of Pushers'
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"

        # Additional annotations as a `Div` object
        num_periods = len(grouped_data['year_quarter'].unique())
        num_observations = len(country_data)
        num_licenses = len(licenses_list)
        annotation = Div(
            text=(
                f"<div style='font-size: 14px; line-height: 1.5;'>"
                f"<b>Number of periods:</b> {num_periods}<br>"
                f"<b>Number of observations:</b> {num_observations}<br>"
                f"<b>Number of licenses:</b> {num_licenses}<br>"
                "<b>Source:</b> GitHub Innovation Graph"
                f"</div>"
            )
        )

        # Add plot as a tab
        tab = TabPanel(child=p, title=country_name)
        tabs.append(tab)

    # Create and show tabs
    if tabs:
        tabs_layout = Tabs(tabs=tabs)
        show(tabs_layout)
    else:
        print("No sufficient data to generate plots.")

# Execute function
create_bokeh_plots(licenses)


# Topics

In [129]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.layouts import column
from bokeh.palettes import Category10

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkey',
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Load the dataset
topics = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/topics.csv')

# Drop "config" and "github-config" from the topic column
topics = topics[~topics['topic'].isin(['config', 'github-config'])]

def create_country_plot(country_code, country_name):
    """Create a plot for a specific country."""
    # Filter data for the country
    country_data = topics[topics.iso2_code == country_code]
    
    # Get top 10 topics by number of pushers
    top_topics = (
        country_data.groupby('topic')['num_pushers']
        .sum()
        .nlargest(10)
        .reset_index()
    )
    
    # Create ColumnDataSource
    source = ColumnDataSource(top_topics)
    
    # Create figure
    p = figure(
        x_range=source.data['topic'],
        height=500,
        width=800,
        title=f"Number of Pushers for Each Topic in {country_name}",
        toolbar_location="above",
        tools="pan,box_zoom,reset,save"
    )
    
    # Add bars
    p.vbar(
        x='topic',
        top='num_pushers',
        width=0.6,
        source=source,
        color="#4A90E2",
        fill_alpha=0.8
    )
    
    # Add text labels
    p.text(
        x='topic',
        y='num_pushers',
        text='num_pushers',
        source=source,
        text_align='center',
        text_baseline='bottom',
        text_font_size="10pt",
        text_color="#333333"
    )
    
    # Add hover tool
    p.add_tools(HoverTool(tooltips=[
        ("Topic", "@topic"),
        ("Pushers", "@num_pushers")
    ]))
    
    # Customize appearance
    p.background_fill_color = "#f5f5f5"
    p.grid.grid_line_color = "#e0e0e0"
    p.grid.grid_line_dash = "dotted"
    p.title.text_color = "#333333"
    p.title.text_font_size = "16pt"
    p.xaxis.axis_label = "Topic"
    p.xaxis.axis_label_text_color = "#333333"
    p.xaxis.major_label_orientation = 0.8
    p.xaxis.major_label_text_font_size = "12pt"  # Changed this line to adjust x-axis label font size
    p.yaxis.axis_label = "Number of Pushers"
    p.yaxis.axis_label_text_color = "#333333"
    p.y_range.start = 0
    
    return p

def create_tabs():
    """Create tabs for all countries."""
    tabs = []
    
    # Create a tab for each country
    for code, name in country_map.items():
        plot = create_country_plot(code, name)
        tab = TabPanel(child=plot, title=name)
        tabs.append(tab)
    
    # Combine all tabs
    return Tabs(tabs=tabs)

# Create and show the visualization
tabs = create_tabs()
show(tabs)

# Programming Languages

In [130]:
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

nan_rows_count = data.isna().any(axis=1).sum()
print(f"There are {nan_rows_count} rows with NaN values in the dataset.")

There are 0 rows with NaN values in the dataset.


In [131]:
# from typing import Dict, List

# # Define the categories
# categories = {
#     "high_level_general_purpose": "High-Level General Purpose Languages",
#     "low_level_system": "Low-Level System Programming Languages",
#     "shell_scripting": "Shell Scripting Languages",
#     "domain_specific": "Domain-Specific Languages"
# }

# # Classify programming languages
# programming_languages: Dict[str, List[str]] = {
#     "high_level_general_purpose": [
#         "Python", "Java", "C#", "JavaScript", "TypeScript", "Ruby", "PHP", "Go",
#         "Swift", "Kotlin", "Rust", "Dart", "Scala", "C++", "Haskell", "Clojure",
#         "Elixir", "F#", "Groovy", "Perl", "Lua", "Erlang", "OCaml", "Julia",
#         "Lisp", "Objective-C", "R", "Delphi", "Visual Basic"
#     ],
#     "low_level_system": [
#         "C", "Assembly", "Fortran", "COBOL", "Ada", "VHDL", "Verilog",
#         "SystemVerilog", "WebAssembly", "x86 Assembly", "ARM Assembly"
#     ],
#     "shell_scripting": [
#         "Bash", "PowerShell", "Zsh", "Fish", "Tcsh", "Ksh", "CMD", "AppleScript"
#     ],
#     "domain_specific": [
#         "SQL", "HTML", "CSS", "MATLAB", "GLSL", "HLSL", "Apex", "ABAP",
#         "Solidity", "XSLT", "VBA", "SAS", "LabVIEW", "AutoHotkey", "VHDL",
#         "Prolog", "COBOL", "Scratch", "YAML", "LaTeX", "Mathematica", "PL/SQL",
#         "T-SQL", "Terraform", "HCL", "Haxe", "D3.js", "GraphQL", "QML", "SASS",
#         "LESS", "Nix", "Postscript", "Max/MSP", "Pure Data", "Verilog", "Puppet"
#     ]
# }

# def export_languages():
#     """Export the list of languages by category."""
#     print("# Programming Languages by Category\n")
    
#     for category_key, category_name in categories.items():
#         print(f"## {category_name}\n")
        
#         languages = sorted(programming_languages[category_key])
#         for language in languages:
#             print(f"- {language}")
        
#         print("\n")

# if __name__ == "__main__":
#     export_languages()

In [132]:
import pandas as pd

url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"

data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

In [133]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)

# Filter out EU and XK (Kosovo)
data = data[data.iso2_code != "EU"]
data = data[data.iso2_code != "XK"]

# Check for NaN values
nan_rows_count = data.isna().any(axis=1).sum()
print(f"There are {nan_rows_count} rows with NaN values in the dataset.")

# Extract unique programming languages from the dataset
programming_languages = data['language'].unique()

# Define the categorization function
def categorize_language(language: str) -> str:
    """Categorize a programming language into one of the four categories."""
    # Define categories in lowercase
    high_level_general_purpose = [
        "python", "java", "c#", "javascript", "typescript", "ruby", "php", "go",
        "swift", "kotlin", "rust", "dart", "scala", "c++", "haskell", "clojure",
        "elixir", "f#", "groovy", "perl", "lua", "erlang", "ocaml", "julia",
        "lisp", "objective-c", "r", "delphi", "visual basic"
    ]
    low_level_system = [
        "c", "assembly", "fortran", "cobol", "ada", "vhdl", "verilog",
        "systemverilog", "webassembly", "x86 assembly", "arm assembly"
    ]
    shell_scripting = [
        "bash", "powershell", "zsh", "fish", "tcsh", "ksh", "cmd", "applescript"
    ]
    domain_specific = [
        "sql", "html", "css", "matlab", "glsl", "hlsl", "apex", "abap",
        "solidity", "xslt", "vba", "sas", "labview", "autohotkey", "vhdl",
        "prolog", "cobol", "scratch", "yaml", "latex", "mathematica", "pl/sql",
        "t-sql", "terraform", "hcl", "haxe", "d3.js", "graphql", "qml", "sass",
        "less", "nix", "postscript", "max/msp", "pure data", "verilog", "puppet"
    ]
    
    # Convert the input language to lowercase for comparison
    language = language.lower()
    
    if language in high_level_general_purpose:
        return "High-Level General Purpose"
    elif language in low_level_system:
        return "Low-Level System"
    elif language in shell_scripting:
        return "Shell Scripting"
    elif language in domain_specific:
        return "Domain-Specific"
    else:
        return "Unknown"

# Apply the categorization function to the dataset
data['language_function'] = data['language'].apply(categorize_language)

# Keep only the relevant columns
data = data[['language', 'language_function']]

# Drop duplicates to ensure each language appears only once
data_domain = data.drop_duplicates()

# Display the updated dataframe
# data_domain
# Optionally, save the updated dataframe to a CSV file
# data.to_csv('unique_languages_with_categories.csv', index=False)
# Optionally, save the updated dataframe to a CSV file
# data.to_csv('languages_with_categories.csv', index=False)

There are 0 rows with NaN values in the dataset.


In [134]:
import pandas as pd

# Read the CSV file
file_path = 'programming_languages.csv'
df = pd.read_csv(file_path)

# Define disruptive tech skills - these are programming languages associated with 
# emerging/disruptive technologies like AI, blockchain, quantum computing, etc.
disruptive_tech_languages = [
    # AI/ML related
    "Python", "R", "Julia", "TensorFlow", "PyTorch", "CUDA", "MLIR",
    
    # Blockchain/Crypto
    "Solidity", "Move", "Cairo", "Vyper", "Yul",
    
    # Quantum Computing
    "Q#", "Silq", "QML", "Quil",
    
    # AR/VR
    "ShaderLab", "HLSL", "GLSL", "Metal",
    
    # Robotics
    "ROS", "URDF", "RobotFramework",
    
    # IoT/Edge Computing
    "Rust", "WebAssembly", "MicroPython", "Arduino",
    
    # New systems programming
    "Zig", "Nim", "Odin", "Crystal", "V",
    
    # Modern data processing
    "Scala", "Spark", "Flink", "Presto", "DataWeave", "jq",
    
    # Cloud native
    "HCL", "Terraform", "Jsonnet", "CUE", "Bicep", "Pulumi", "Nix",
    
    # Smart contracts/Decentralized apps
    "Solidity", "Clarity", "Cairo",
    
    # Modern functional languages
    "Haskell", "Elm", "Reason", "ReScript", "F#", "Elixir", "Erlang", "OCaml", "PureScript", "Idris", "Lean"
]

# Function to classify programming languages
def classify_linkedin_skill(language):
    if language in disruptive_tech_languages:
        return "Disruptive Tech Skills"
    else:
        return "Tech Skills"

# Apply the classification function
df['linkedin_skill_category'] = df['Programming Language'].apply(classify_linkedin_skill)


# Count of each category
category_counts = df['linkedin_skill_category'].value_counts()
print("\nLinkedIn Skill Category Distribution:")
print(category_counts)
df_linkedin_skill = df.rename(columns={"Programming Language": "language"})[['language', 'linkedin_skill_category']]


LinkedIn Skill Category Distribution:
linkedin_skill_category
Tech Skills               344
Disruptive Tech Skills     43
Name: count, dtype: int64


In [135]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)

# Define the classification function
def classify_language(language: str) -> list:
    """Classify a programming language into one or more categories."""
    # Define categories
    categories = {
        "Web Development (Frontend & Backend)": [
            "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "Python", "ASP.NET", "Vue", "React", "Angular", "Svelte"
        ],
        "Mobile App Development": [
            "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
        ],
        "Data Science / AI / Machine Learning": [
            "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
        ],
        "Systems Programming": [
            "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
        ],
        "Game Development": [
            "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
        ],
        "Scripting / Automation / DevOps": [
            "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
        ],
        "Embedded / IoT": [
            "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
        ],
        "Academic / Research / Scientific Computing": [
            "MATLAB", "R", "Python", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
        ],
        "Database / Query Languages": [
            "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
        ],
        "Functional Programming / Language Design": [
            "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
        ],
        "UI / UX / Markup / Templating": [
            "HTML", "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
        ],
        "Miscellaneous / Niche / Historical / Low Market Demand": [
            "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
        ]
    }
    
    # Convert language to uppercase for case-insensitive matching
    language = language.upper()
    
    # Find all categories the language belongs to
    matched_categories = [category for category, langs in categories.items() if language in map(str.upper, langs)]
    
    # Return the list of categories (or "Unknown" if no match is found)
    return matched_categories if matched_categories else ["Unknown"]

# Apply the classification function to the dataset
data['language_categories'] = data['language'].apply(classify_language)

# Combine multiple categories into a single string
data['language_job_cat'] = data['language_categories'].apply(lambda x: ", ".join(x))

# Drop duplicates to ensure each language appears only once
data_unique = data[['language', 'language_job_cat']].drop_duplicates()

# Rename columns for clarity
data_job_cat = data_unique.rename(columns={'Programming Language': 'language'})

# Display the updated dataframe
# print(data_unique.head())
# data_job_cat

# Optionally, save the updated dataframe to a CSV file
# data_unique.to_csv('unique_languages_with_categories.csv', index=False)

In [136]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)

# Create a clean list of unique languages
unique_languages = data['language'].unique()

# Create an empty DataFrame for the results
result_df = pd.DataFrame(index=unique_languages)
result_df.index.name = 'language'

# Define the categories with improved names
categories = {
    "WebDev": [  # Web Development (Frontend & Backend)
        "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "ASP.NET", "Vue", "React", "Angular", "Svelte"
    ],
    "MobileDev": [  # Mobile App Development
        "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
    ],
    "DataScience": [  # Data Science / AI / Machine Learning
        "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
    ],
    "SystemsProg": [  # Systems Programming
        "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
    ],
    "GameDev": [  # Game Development
        "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
    ],
    "DevOps": [  # Scripting / Automation / DevOps
        "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
    ],
    "Embedded": [  # Embedded / IoT
        "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
    ],
    "Scientific": [  # Academic / Research / Scientific Computing
        "MATLAB", "R", "Python", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
    ],
    "Database": [  # Database / Query Languages
        "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
    ],
    "Functional": [  # Functional Programming / Language Design
        "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
    ],
    "UI_UX": [  # UI / UX / Markup / Templating
        "HTML", "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
    ],
    "Legacy": [  # Miscellaneous / Niche / Historical / Low Market Demand
        "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
    ]
}

# Define a function to check if a language belongs to a category
def belongs_to_category(language, category_languages):
    return 1 if language.upper() in [lang.upper() for lang in category_languages] else 0

# For each category, create a binary column
for category_name, category_languages in categories.items():
    column_name = f"cat_{category_name}"  # Prefix with 'cat_' for clarity
    result_df[column_name] = result_df.index.map(lambda lang: belongs_to_category(lang, category_languages))

# Reset index to make 'language' a column
result_df = result_df.reset_index()

# Add a string column with all the categories a language belongs to
def get_categories_list(row):
    # Get all categories where the value is 1
    matched_categories = [cat_name for cat_name in categories.keys() if row[f'cat_{cat_name}'] == 1]
    return ", ".join(matched_categories) if matched_categories else "Unknown"

result_df['language_categories'] = result_df.apply(get_categories_list, axis=1)

# Display the first few rows
# print(result_df.head())

# Verify the values are binary for a specific category
# print("\nStatistics for Scientific Computing:")
# print(result_df["cat_Scientific"].describe())
data_job_cat_2 = result_df

# Save the results to a CSV file
# result_df.to_csv('languages_with_categories.csv', index=False)

In [137]:
# Merge the dataframes on the 'language' column
language_categ = data_domain.merge(df_linkedin_skill, on='language', how='outer') \
                         .merge(data_job_cat, on='language', how='outer') \
                         .merge(data_job_cat_2, on='language', how='outer')
                         

# Export the merged dataframe to a CSV file
language_categ.to_csv('program_languages_categ.csv', index=False)

# language_categ
# Display the first few rows of the merged dataframe

In [138]:
import pandas as pd

url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"

data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

In [139]:
# Seleccionamos los 10 lenguajes mas usados en Data Science
top_program_lang = programming_languages = [
    "Python", "Java",
    "C++", "C","JavaScript", "C#",
    "Ruby", "Go", "PHP", "TypeScript"
]

In [140]:
data2 = data[data['language'].isin(top_program_lang)]
data2 = data2.reset_index(drop=True)

data2['year_quarter'] = data2['year'].astype(str) + '-Q' + data2['quarter'].astype(str)
data2
# Reseteamos el index
data2 = data2.reset_index(drop=True)

In [141]:
# Creating a unique identifier
data2['unique_id'] = data2['iso2_code'] + '-' + data2['language']

In [142]:
# Create a DataFrame of unique identifiers
iso2_code = pd.DataFrame({'iso2_code': data2['iso2_code'].unique()})

# Create a DataFrame of unique identifiers for languages
language = pd.DataFrame({'language': data2['language'].unique()})

# Create a DataFrame of all time periods
# time_periods = pd.DataFrame({'year_quarter': range(data2['year_quarter'].min(), data2['year_quarter'].max() + 1)})
year_quarter = pd.DataFrame({'year_quarter': data2['year_quarter'].unique()})

# Create the Cartesian product of unique_ids and time_periods
balanced_panel = iso2_code.merge(language, how='cross').merge(year_quarter, how='cross')

balanced_panel["unique_id"] = balanced_panel["iso2_code"] + "-" +balanced_panel["language"]
balanced_panel

# # Merge the balanced panel with the original data
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left')

# # Merge the DataFrames with suffixes
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left', suffixes=('', '_y'))

# # Now, drop the columns with '_y' suffix, which are from the right DataFrame
balanced_df = balanced_df.loc[:, ~balanced_df.columns.str.endswith('_y')]

In [143]:
# Function to convert quarter format to integer
def quarter_to_int(quarter_string):
    year, q = quarter_string.split('-')
    year = int(year)
    quarter_number = int(q[1])  # Q1, Q2, Q3, Q4 -> 1, 2, 3, 4
    base_year = 2020  # Adjust based on your balanced_df, or set dynamically
    return 4 * (year - base_year) + quarter_number

# Applying the function
balanced_df['quarter'] = balanced_df['year_quarter'].apply(quarter_to_int)
balanced_df['year'] = balanced_df['year_quarter'].str.split('-').str[0]
balanced_df.loc[balanced_df["num_pushers"].isnull(), "num_pushers"] = 0
# balanced_df

In [144]:
# Definimos una función que convierte los nombres de países en códigos iso2
def country_to_iso2(country_name):
    try:
        # Intentamos obtener el código iso2 del país usando pycountry
        return pycountry.countries.get(name=country_name).alpha_2
    except AttributeError:
        try:
            # Manejamos casos especiales donde el nombre del país no coincide exactamente con la base de datos de pycountry
            special_cases = {
                "Czechia (Czech Republic)": "CZ",
                "Congo (Congo-Brazzaville)": "CG",
                "Holy See": "VA",
                "Timor-Leste (East Timor)": "TL",
                "Ukraine (with certain exceptions)": "UA",
                "Taiwan": "TW",
                "Bolivia": "BO",
                "Tanzania": "TZ",
                "South Korea": "KR",
                "Moldova": "MD",
                "Brunei": "BN"
            }
            return special_cases[country_name]
        except KeyError:
            return None

# Creamos una lista de países y obtenemos sus códigos iso2 usando la función country_to_iso2
gpt_countries_list = [
    "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bangladesh", "Barbados", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
    "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Cabo Verde", "Canada",
    "Chile", "Colombia", "Comoros", "Congo (Congo-Brazzaville)", "Costa Rica", "Côte d'Ivoire", "Croatia", "Cyprus",
    "Czechia", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "El Salvador", "Estonia", "Fiji",
    "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea",
    "Guinea-Bissau", "Guyana", "Haiti", "Holy See", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iraq",
    "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait",
    "Kyrgyzstan", "Latvia", "Lebanon", "Lesotho", "Liberia", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar",
    "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
    "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
    "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia", "Norway",
    "Oman", "Pakistan", "Palau", "Palestine, State of", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Poland", "Portugal", "Qatar", "Romania", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal",
    "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "South Africa",
    "South Korea", "Spain", "Sri Lanka", "Suriname", "Sweden", "Switzerland", "Taiwan", "Tanzania", "Thailand",
    "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Tuvalu", "Uganda", "Ukraine",
    "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Vanuatu", "Zambia"
]

gpt_countries_iso = [country_to_iso2(country) for country in gpt_countries_list]

# Agregamos una nueva columna gpt_available, con valor 1 si el código iso2 está en la lista gpt_countries_iso, y 0 en caso contrario
balanced_df["gpt_available"] = balanced_df["iso2_code"].apply(lambda row: 1 if row in gpt_countries_iso else 0)

In [145]:
countries = balanced_df.iso2_code.unique()

def create_populations_dictionary():
    country_populations = {}
    special_cases = {"MM": 54688774, "PS": 5483450, "ME": 602445, "AD":79824}
    # special_cases = {}
    for country in countries:
        try:
            country_populations.update({country: CInfo(country).info()["population"]})
        except KeyError:
            try:
                fallback_name = pycountry.countries.lookup(country).name
                country_populations.update({country: CInfo(fallback_name).info()["population"]})
            except KeyError:
                print(country)
                country_populations.update({country: special_cases[country]})

    return country_populations

country_populations = create_populations_dictionary()

ME
MM
PS
AD


In [146]:
# Create population
balanced_df["population"] = balanced_df["iso2_code"].map(country_populations)

# Create pushes_pc
balanced_df["num_pushers_pc"] = (balanced_df["num_pushers"] / balanced_df["population"])*100000

In [147]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div
from bokeh.palettes import Category10

# Initialize Bokeh in the notebook
output_notebook()

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'ZA': 'South Africa',
    'ID': 'Indonesia', 
    'US': 'United States'
}

def create_country_plot_bokeh(df_filtered, country_code, country_name):
    """Create an interactive Bokeh plot for a specific country."""
    # Filter data by country
    country_data = df_filtered[df_filtered['iso2_code'] == country_code]

    if country_data.empty:
        print(f"No data available for {country_name}.")
        return None

    # Create figure with adjusted size
    p = figure(
        title=f'Number of pushers per 100k by programming language - {country_name}',
        x_range=sorted(country_data['year_quarter'].unique()),
        width=800,  # Wider plot
        height=500   # Taller plot
    )

    # List of unique languages
    languages = sorted(country_data['language'].unique())
    colors = Category10[len(languages)] if len(languages) <= 10 else Category10[10]

    # Add lines and points for each language
    for i, lang in enumerate(languages):
        lang_data = country_data[country_data['language'] == lang]
        source = ColumnDataSource(lang_data)
        p.line(
            x='year_quarter',
            y='num_pushers_pc',
            source=source,
            line_width=3,  # Thicker lines
            legend_label=lang,
            color=colors[i % len(colors)]
        )
        p.scatter(
            x='year_quarter',
            y='num_pushers_pc',
            source=source,
            size=10,  # Larger points
            color=colors[i % len(colors)],
            legend_label=lang
        )

    # Configure legend
    p.legend.title = "Programming Languages"
    p.legend.label_text_font_size = "12pt"
    p.legend.title_text_font_size = "14pt"
    p.legend.click_policy = "hide"

    # Move legend outside of the plot area to the right
    p.add_layout(p.legend[0], 'right')

    # Configure tools
    p.add_tools(HoverTool(
        tooltips=[
            ("Quarter", "@year_quarter"),
            ("Pushers per 100k", "@num_pushers_pc"),
            ("Language", "@language"),
        ],
        mode='vline'
    ))

    # Configure axes and background
    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Number of pushers per 100k'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.xaxis.axis_label_text_font_size = "14pt"
    p.yaxis.axis_label_text_font_size = "14pt"
    p.title.text_font_size = "18pt"
    p.background_fill_color = "#f9f9f9"

    # Additional annotations as a `Div` object
    num_periods = len(country_data['year_quarter'].unique())
    num_observations = len(country_data)
    num_languages = len(languages)
    annotation = Div(
        text=(
            f"<div style='font-size: 14px; line-height: 1.5;'>"
            f"<b>Number of periods:</b> {num_periods}<br>"
            f"<b>Number of observations:</b> {num_observations}<br>"
            f"<b>Number of languages:</b> {num_languages}<br>"
            "<b>Source:</b> GitHub Innovation Graph"
            f"</div>"
        )
    )

    return p, annotation

# Create tabs for each country
def create_tabs_for_countries(df):
    """Create tabs with plots for each country."""
    tabs = []
    for code, name in country_map.items():
        plot, annotation = create_country_plot_bokeh(df, code, name)
        if plot:
            # Add plot to a tab
            tab = TabPanel(child=plot, title=name)
            tabs.append(tab)

    if tabs:
        tabs_layout = Tabs(tabs=tabs)
        show(tabs_layout)
    else:
        print("No sufficient data to generate plots.")

# Execute with the provided data
# Assuming `balanced_df` is already defined and loaded
# Replace `balanced_df` with your actual DataFrame variable
create_tabs_for_countries(balanced_df)


In [148]:
import pandas as pd

url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"

data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

In [149]:
import pandas as pd

url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"

data = pd.read_csv(url, delimiter=',', keep_default_na=False)
# Filtro de EU
data = data[data.iso2_code != "EU"]

# Filtro de XK = Kosovo
data = data[data.iso2_code != "XK"]

# # Seleccionamos los 10 lenguajes mas usados en Data Science
# top_program_lang = programming_languages = [
#     "Python", "Java",
#     "C++", "C","JavaScript", "C#",
#     "Ruby", "Go", "PHP", "TypeScript"
# ]

data2 = data.copy()
data2 = data2.reset_index(drop=True)

data2['year_quarter'] = data2['year'].astype(str) + '-Q' + data2['quarter'].astype(str)
data2
# Reseteamos el index
data2 = data2.reset_index(drop=True)
# Creating a unique identifier
data2['unique_id'] = data2['iso2_code'] + '-' + data2['language']
# Create a DataFrame of unique identifiers
iso2_code = pd.DataFrame({'iso2_code': data2['iso2_code'].unique()})

# Create a DataFrame of unique identifiers for languages
language = pd.DataFrame({'language': data2['language'].unique()})

# Create a DataFrame of all time periods
# time_periods = pd.DataFrame({'year_quarter': range(data2['year_quarter'].min(), data2['year_quarter'].max() + 1)})
year_quarter = pd.DataFrame({'year_quarter': data2['year_quarter'].unique()})

# Create the Cartesian product of unique_ids and time_periods
balanced_panel = iso2_code.merge(language, how='cross').merge(year_quarter, how='cross')

balanced_panel["unique_id"] = balanced_panel["iso2_code"] + "-" +balanced_panel["language"]
balanced_panel

# # Merge the balanced panel with the original data
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left')

# # Merge the DataFrames with suffixes
balanced_df = balanced_panel.merge(data2, on=['unique_id', 'year_quarter'], how='left', suffixes=('', '_y'))

# # Now, drop the columns with '_y' suffix, which are from the right DataFrame
balanced_df = balanced_df.loc[:, ~balanced_df.columns.str.endswith('_y')]
# Function to convert quarter format to integer
def quarter_to_int(quarter_string):
    year, q = quarter_string.split('-')
    year = int(year)
    quarter_number = int(q[1])  # Q1, Q2, Q3, Q4 -> 1, 2, 3, 4
    base_year = 2020  # Adjust based on your balanced_df, or set dynamically
    return 4 * (year - base_year) + quarter_number

# Applying the function
balanced_df['quarter'] = balanced_df['year_quarter'].apply(quarter_to_int)
balanced_df['year'] = balanced_df['year_quarter'].str.split('-').str[0]
balanced_df.loc[balanced_df["num_pushers"].isnull(), "num_pushers"] = 0
# balanced_df
# Definimos una función que convierte los nombres de países en códigos iso2
def country_to_iso2(country_name):
    try:
        # Intentamos obtener el código iso2 del país usando pycountry
        return pycountry.countries.get(name=country_name).alpha_2
    except AttributeError:
        try:
            # Manejamos casos especiales donde el nombre del país no coincide exactamente con la base de datos de pycountry
            special_cases = {
                "Czechia (Czech Republic)": "CZ",
                "Congo (Congo-Brazzaville)": "CG",
                "Holy See": "VA",
                "Timor-Leste (East Timor)": "TL",
                "Ukraine (with certain exceptions)": "UA",
                "Taiwan": "TW",
                "Bolivia": "BO",
                "Tanzania": "TZ",
                "South Korea": "KR",
                "Moldova": "MD",
                "Brunei": "BN"
            }
            return special_cases[country_name]
        except KeyError:
            return None

# Creamos una lista de países y obtenemos sus códigos iso2 usando la función country_to_iso2
gpt_countries_list = [
    "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia", "Australia", "Austria",
    "Azerbaijan", "Bahamas", "Bangladesh", "Barbados", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia",
    "Bosnia and Herzegovina", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina Faso", "Cabo Verde", "Canada",
    "Chile", "Colombia", "Comoros", "Congo (Congo-Brazzaville)", "Costa Rica", "Côte d'Ivoire", "Croatia", "Cyprus",
    "Czechia", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "Ecuador", "El Salvador", "Estonia", "Fiji",
    "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea",
    "Guinea-Bissau", "Guyana", "Haiti", "Holy See", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iraq",
    "Ireland", "Israel", "Italy", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "Kuwait",
    "Kyrgyzstan", "Latvia", "Lebanon", "Lesotho", "Liberia", "Liechtenstein", "Lithuania", "Luxembourg", "Madagascar",
    "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico",
    "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Namibia",
    "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "North Macedonia", "Norway",
    "Oman", "Pakistan", "Palau", "Palestine, State of", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines",
    "Poland", "Portugal", "Qatar", "Romania", "Rwanda", "Saint Kitts and Nevis", "Saint Lucia",
    "Saint Vincent and the Grenadines", "Samoa", "San Marino", "Sao Tome and Principe", "Saudi Arabia", "Senegal",
    "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "South Africa",
    "South Korea", "Spain", "Sri Lanka", "Suriname", "Sweden", "Switzerland", "Taiwan", "Tanzania", "Thailand",
    "Timor-Leste", "Togo", "Tonga", "Trinidad and Tobago", "Tunisia", "Turkey", "Tuvalu", "Uganda", "Ukraine",
    "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Vanuatu", "Zambia"
]

gpt_countries_iso = [country_to_iso2(country) for country in gpt_countries_list]

# Agregamos una nueva columna gpt_available, con valor 1 si el código iso2 está en la lista gpt_countries_iso, y 0 en caso contrario
balanced_df["gpt_available"] = balanced_df["iso2_code"].apply(lambda row: 1 if row in gpt_countries_iso else 0)
countries = balanced_df.iso2_code.unique()

def create_populations_dictionary():
    country_populations = {}
    special_cases = {"MM": 54688774, "PS": 5483450, "ME": 602445, "AD":79824}
    # special_cases = {}
    for country in countries:
        try:
            country_populations.update({country: CInfo(country).info()["population"]})
        except KeyError:
            try:
                fallback_name = pycountry.countries.lookup(country).name
                country_populations.update({country: CInfo(fallback_name).info()["population"]})
            except KeyError:
                print(country)
                country_populations.update({country: special_cases[country]})

    return country_populations

country_populations = create_populations_dictionary()
# Create population
balanced_df["population"] = balanced_df["iso2_code"].map(country_populations)

# Create pushes_pc
balanced_df["num_pushers_pc"] = (balanced_df["num_pushers"] / balanced_df["population"])*100000

ME
MM
PS
AD


In [150]:
# Perform the many-to-one merge
balanced_df_cat = balanced_df.merge(
    language_categ,
    on='language',
    how='left'
)
# balanced_df_cat.columns

## Disruptive Tech and Tech Skills by Programming Language - Top 20

In [151]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div
from bokeh.palettes import Category10

# Show Bokeh plots in notebook
output_notebook()

# Load classification file
classification = pd.read_csv("programming_languages_categorized.csv")

# Merge classification into your GitHub dataset
df = balanced_df.merge(classification, how="left", left_on="language", right_on="Programming Language")

# Sanity check: drop rows without classification
df = df.dropna(subset=["Skill Group"])

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TU': 'Turkiye',
    'ZA': 'South Africa', 
    'ID': 'Indonesia', 
    'US': 'United States'
}

def create_country_plot_by_skill(df_filtered, country_code, country_name):
    """Create a Bokeh plot by skill group (Tech Skill vs. Disruptive Tech Skill)"""
    country_data = df_filtered[df_filtered['iso2_code'] == country_code]

    if country_data.empty:
        print(f"No data available for {country_name}.")
        return None

    # Group by skill and quarter
    grouped = (
        country_data.groupby(['year_quarter', 'Skill Group'])
        .agg(num_pushers_pc=('num_pushers_pc', 'sum'))
        .reset_index()
    )

    # Create figure
    p = figure(
        title=f'Number of pushers per 100k by skill category - {country_name}',
        x_range=sorted(grouped['year_quarter'].unique()),
        width=800,
        height=500
    )

    categories = grouped['Skill Group'].unique()
    colors = Category10[3]  # Even though we only need 2, Category10[3] provides 3 distinct colors


    for i, skill in enumerate(categories):
        skill_data = grouped[grouped['Skill Group'] == skill]
        source = ColumnDataSource(skill_data)
        p.line(
            x='year_quarter',
            y='num_pushers_pc',
            source=source,
            line_width=3,
            legend_label=skill,
            color=colors[i % len(colors)]
        )
        p.circle(
            x='year_quarter',
            y='num_pushers_pc',
            source=source,
            size=10,
            color=colors[i % len(colors)],
            legend_label=skill
        )

    # Configure plot
    p.legend.title = "Skill Group"
    p.legend.label_text_font_size = "12pt"
    p.legend.title_text_font_size = "14pt"
    p.legend.click_policy = "hide"
    p.add_layout(p.legend[0], 'right')

    p.add_tools(HoverTool(
        tooltips=[
            ("Quarter", "@year_quarter"),
            ("Pushers per 100k", "@num_pushers_pc"),
            ("Skill Group", "@{Skill Group}")
        ],
        mode='vline'
    ))

    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Number of pushers per 100k'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.background_fill_color = "#f9f9f9"

    return p, Div(text="")

def create_tabs_for_countries(df):
    """Create a tab for each country using skill group plots."""
    tabs = []
    for code, name in country_map.items():
        result = create_country_plot_by_skill(df, code, name)
        if result is not None:
            plot, annotation = result
            if plot:
                tab = TabPanel(child=plot, title=name)
                tabs.append(tab)

    if tabs:
        show(Tabs(tabs=tabs))
    else:
        print("No plots to display.")

# Run this to generate tabs for all countries by skill group
create_tabs_for_countries(df)



No data available for Turkiye.




## Disruptive Technology and Subcategories

In [152]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div, Legend
from bokeh.palettes import Category20, Turbo256
from bokeh.layouts import row

# Show Bokeh plots in notebook
output_notebook()

file_path = 'programming_languages.csv'
# Read the CSV file with programming languages
df_languages = pd.read_csv(file_path)

# Define more specific tech categories with precise language lists
tech_categories = {
    "AI_ML": [  # AI/Machine Learning
         "Python", "R", "Julia", "TensorFlow", "PyTorch", "CUDA", "MLIR", "Stan"
    ],
    "Blockchain": [  # Blockchain/Crypto
        "Solidity", "Move", "Cairo", "Vyper", "Yul", "Clarity"
    ],
    "QuantumComputing": [  # Quantum Computing
        "Q#", "Silq", "QML", "Quil"
    ],
    "Graphics_AR_VR": [  # Computer Graphics, AR/VR
        "ShaderLab", "HLSL", "GLSL", "Metal", "WebGL"
    ],
    "Robotics_IoT": [  # Robotics and IoT
        "ROS", "URDF", "RobotFramework", "Arduino", "MicroPython"
    ],
    "ModernSystems": [  # Modern systems programming
        "Rust", "WebAssembly", "Zig", "Nim", "Odin", "Crystal", "V", "Go"
    ],
    "DataEngineering": [  # Modern data processing/Big Data
        "Scala", "Spark", "Flink", "Presto", "DataWeave", "jq"
    ],
    "CloudNative": [  # Cloud-native technologies
        "HCL", "Terraform", "Jsonnet", "CUE", "Bicep", "Pulumi", "Nix", "Docker", "Dockerfile", "Kubernetes"
    ],
    "FunctionalProgramming": [  # Modern functional languages
        "Haskell", "Elm", "Reason", "ReScript", "F#", "Elixir", "Erlang", "OCaml", "PureScript", "Idris", "Lean", "Clojure"
    ]
}

# Function to classify programming languages with case-insensitive matching
def classify_tech_category(language):
    if not isinstance(language, str):
        return "General_Tech"
    
    # Convert the language to lowercase for comparison
    language_lower = language.lower()
    
    # Check if the language belongs to any specific tech category
    for category, languages in tech_categories.items():
        if any(lang.lower() == language_lower for lang in languages):
            return category
    
    # If no specific category matches, classify as general Tech Skills
    return "General_Tech"

# Apply the classification function
df_languages['tech_category'] = df_languages['Programming Language'].apply(classify_tech_category)

# Rename the column for consistency
df_linkedin_skill = df_languages.rename(columns={"Programming Language": "language"})[['language', 'tech_category']]

# Merge classification into your GitHub dataset (balanced_df)
df = balanced_df.merge(df_linkedin_skill, how="left", left_on="language", right_on="language")

# Fill NaN values in tech_category with "General_Tech"
df['tech_category'] = df['tech_category'].fillna("General_Tech")

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TU': 'Turkiye',
    'ZA': 'South Africa', 
    'ID': 'Indonesia', 
    'US': 'United States'
}

def create_country_plot_by_tech_category(df_filtered, country_code, country_name):
    """Create a Bokeh plot by detailed tech category, excluding General_Tech"""
    country_data = df_filtered[df_filtered['iso2_code'] == country_code]

    if country_data.empty:
        print(f"No data available for {country_name}.")
        return None

    # Filter out General_Tech
    country_data = country_data[country_data['tech_category'] != "General_Tech"]
    
    if country_data.empty:
        print(f"No specific tech categories found for {country_name} after filtering General_Tech.")
        return None

    # Group by tech category and quarter
    grouped = (
        country_data.groupby(['year_quarter', 'tech_category'])
        .agg(num_pushers_pc=('num_pushers_pc', 'sum'))
        .reset_index()
    )

    # Create figure
    p = figure(
        title=f'Disruptive Technology Adoption Trends in {country_name}',
        x_range=sorted(grouped['year_quarter'].unique()),
        width=800,
        height=500,
        tooltips=[
            ("Quarter", "@year_quarter"),
            ("Pushers per 100k", "@num_pushers_pc{0.00}"),
            ("Tech Category", "@tech_category")
        ]
    )

    # Get unique categories and assign colors
    categories = sorted(grouped['tech_category'].unique())
    
    # Use Category20 if we have 20 or fewer categories, otherwise use Turbo256
    if len(categories) <= 20:
        colors = Category20[20][:len(categories)]
    else:
        # Create evenly spaced colors from Turbo256
        step = len(Turbo256) // len(categories)
        colors = [Turbo256[i * step] for i in range(len(categories))]

    # Create lists to store legend items
    legend_items = []

    # Create a line for each tech category
    for i, category in enumerate(categories):
        category_data = grouped[grouped['tech_category'] == category]
        if not category_data.empty:  # Only plot if we have data
            source = ColumnDataSource(category_data)
            line = p.line(
                x='year_quarter',
                y='num_pushers_pc',
                source=source,
                line_width=3,
                color=colors[i % len(colors)]
            )
            circle = p.circle(
                x='year_quarter',
                y='num_pushers_pc',
                source=source,
                size=8,
                color=colors[i % len(colors)]
            )
            
            # Add to legend items
            legend_items.append((category, [line, circle]))

    # Create stand-alone legend
    legend = Legend(
        items=legend_items,
        location="center",
        title="Disruptive Technology Category",
        title_text_font_size="14pt",
        label_text_font_size="12pt",
        click_policy="hide",
        glyph_height=20,
        glyph_width=20,
        padding=5,
        spacing=8,
        background_fill_alpha=0.5
    )

    # Add the legend outside the plot
    p.add_layout(legend, 'right')

    # Configure plot
    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Number of pushers per 100k population'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.background_fill_color = "#f9f9f9"
    p.grid.grid_line_alpha = 0.3

    return p, Div(text="")

def create_tabs_for_countries(df):
    """Create a tab for each country using detailed tech category plots."""
    tabs = []
    for code, name in country_map.items():
        result = create_country_plot_by_tech_category(df, code, name)
        if result is not None:
            plot, annotation = result
            tab = TabPanel(child=plot, title=name)
            tabs.append(tab)

    if tabs:
        show(Tabs(tabs=tabs))
    else:
        print("No plots to display.")

# Run this to generate tabs for all countries by tech category
create_tabs_for_countries(df)



No data available for Turkiye.




In [153]:
import pandas as pd

# Load the dataset
url = "https://raw.githubusercontent.com/github/innovationgraph/main/data/languages.csv"
data = pd.read_csv(url, delimiter=',', keep_default_na=False)

# Create a clean list of unique languages
unique_languages = data['language'].unique()

# Create an empty DataFrame for the results
result_df = pd.DataFrame(index=unique_languages)
result_df.index.name = 'language'

# Define the categories with improved names
categories = {
    "WebDev": [  # Web Development (Frontend & Backend)
        "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "ASP.NET", "Vue", "React", "Angular", "Svelte"
    ],
    "MobileDev": [  # Mobile App Development
        "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
    ],
    "DataScience": [  # Data Science / AI / Machine Learning
        "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
    ],
    "SystemsProg": [  # Systems Programming
        "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
    ],
    "GameDev": [  # Game Development
        "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
    ],
    "DevOps": [  # Scripting / Automation / DevOps
        "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
    ],
    "Embedded": [  # Embedded / IoT
        "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
    ],
    "Scientific": [  # Academic / Research / Scientific Computing
        "MATLAB", "R", "Python", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
    ],
    "Database": [  # Database / Query Languages
        "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
    ],
    "Functional": [  # Functional Programming / Language Design
        "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
    ],
    "UI_UX": [  # UI / UX / Markup / Templating
        "HTML", "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
    ],
    "Legacy": [  # Miscellaneous / Niche / Historical / Low Market Demand
        "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
    ]
}

# Define a function to check if a language belongs to a category
def belongs_to_category(language, category_languages):
    return 1 if language.upper() in [lang.upper() for lang in category_languages] else 0

# For each category, create a binary column
for category_name, category_languages in categories.items():
    column_name = f"cat_{category_name}"  # Prefix with 'cat_' for clarity
    result_df[column_name] = result_df.index.map(lambda lang: belongs_to_category(lang, category_languages))

# Reset index to make 'language' a column
result_df = result_df.reset_index()

# Add a string column with all the categories a language belongs to
def get_categories_list(row):
    # Get all categories where the value is 1
    matched_categories = [cat_name for cat_name in categories.keys() if row[f'cat_{cat_name}'] == 1]
    return ", ".join(matched_categories) if matched_categories else "Unknown"

result_df['language_categories'] = result_df.apply(get_categories_list, axis=1)

# Display the first few rows
# print(result_df.head())

# # Verify the values are binary for a specific category
# print("\nStatistics for Scientific Computing:")
# print(result_df["cat_Scientific"].describe())
# data_job_cat_2 = result_df

# Save the results to a CSV file
# result_df.to_csv('languages_with_categories.csv', index=False)

## Programming Language Classification by Development Domain

In [154]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div, Legend
from bokeh.palettes import Category10
from bokeh.layouts import column

# Show Bokeh plots in notebook
output_notebook()

# Define the categories with improved names
categories = {
    "WebDev": [  # Web Development (Frontend & Backend)
        "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "ASP.NET", "Vue", "React", "Angular", "Svelte"
    ],
    "MobileDev": [  # Mobile App Development
        "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
    ],
    "DataScience": [  # Data Science / AI / Machine Learning
        "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
    ],
    "SystemsProg": [  # Systems Programming
        "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
    ],
    "GameDev": [  # Game Development
        "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
    ],
    "DevOps": [  # Scripting / Automation / DevOps
        "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
    ],
    "Embedded": [  # Embedded / IoT
        "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
    ],
    "Scientific": [  # Academic / Research / Scientific Computing
        "MATLAB", "R", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
    ],
    "Database": [  # Database / Query Languages
        "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
    ],
    "Functional": [  # Functional Programming / Language Design
        "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
    ],
    "UI_UX": [  # UI / UX / Markup / Templating
     "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
    ],
    "Legacy": [  # Miscellaneous / Niche / Historical / Low Market Demand
        "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
    ]
}

# Define a function to check if a language belongs to a category (case-insensitive)
def belongs_to_category(language, category_languages):
    if not isinstance(language, str):
        return 0
    return 1 if language.upper() in [lang.upper() for lang in category_languages] else 0

# Create a copy of balanced_df to add category columns
balanced_df_cat = balanced_df.copy()

# For each category, create a binary column
for category_name, category_languages in categories.items():
    column_name = f"cat_{category_name}"  # Prefix with 'cat_' for clarity
    balanced_df_cat[column_name] = balanced_df_cat['language'].apply(
        lambda lang: belongs_to_category(lang, category_languages)
    )

# Focus only on Bangladesh data
country_code = 'BD'
country_name = 'Bangladesh'

# Filter the data for Bangladesh
bd_data = balanced_df_cat[balanced_df_cat['iso2_code'] == country_code].copy()

# List of all category columns
cat_columns = ['cat_WebDev', 'cat_DevOps', 'cat_UI_UX', 'cat_MobileDev', 'cat_DataScience', 'cat_SystemsProg', 
               'cat_GameDev', 'cat_Embedded', 'cat_Scientific', 
               'cat_Database', 'cat_Functional', 'cat_Legacy']

# Organize categories into 3 tabs with 4 categories each
tab_categories = [
    cat_columns[0:4],   # Tab 1: WebDev, MobileDev, DataScience, SystemsProg
    cat_columns[4:8],   # Tab 2: GameDev, DevOps, Embedded, Scientific
    cat_columns[8:12]   # Tab 3: Database, Functional, UI_UX, Legacy
]

# Mapping for more readable category names
category_names = {
    'cat_WebDev': 'Web Development',
    'cat_MobileDev': 'Mobile Development',
    'cat_DataScience': 'Data Science',
    'cat_SystemsProg': 'Systems Programming',
    'cat_GameDev': 'Game Development',
    'cat_DevOps': 'DevOps',
    'cat_Embedded': 'Embedded Systems',
    'cat_Scientific': 'Scientific Computing',
    'cat_Database': 'Database',
    'cat_Functional': 'Functional Programming',
    'cat_UI_UX': 'UI/UX',
    'cat_Legacy': 'Legacy Systems'
}

def create_category_plot(bd_data, categories, tab_name):
    """Create a plot for specific categories"""
    
    # Sort data by quarter to ensure chronological order
    bd_data = bd_data.sort_values('year_quarter')
    
    # Get unique quarters
    quarters = bd_data['year_quarter'].unique()
    
    # Prepare data for plotting
    plot_data = {}
    
    for quarter in quarters:
        quarter_data = bd_data[bd_data['year_quarter'] == quarter]
        
        # For each category, sum the pushers_pc where the category dummy == 1
        for cat in categories:
            if cat not in plot_data:
                plot_data[cat] = {'quarters': [], 'values': []}
            
            # Filter the data where the category is 1, then sum num_pushers_pc
            cat_value = quarter_data[quarter_data[cat] == 1]['num_pushers_pc'].sum()
            
            plot_data[cat]['quarters'].append(quarter)
            plot_data[cat]['values'].append(cat_value)
    
    # Create figure
    p = figure(
        title=f'Technology Category Adoption in Bangladesh - {tab_name}',
        x_range=sorted(quarters),
        width=800,
        height=500,
        tooltips=[
            ("Quarter", "@quarters"),
            ("Pushers per 100k", "@values{0.00}"),
            ("Category", "$name")
        ]
    )
    
    # Use Category10 colors
    colors = Category10[len(categories)]
    
    # Create lists to store legend items
    legend_items = []
    
    # Add a line for each category
    for i, cat in enumerate(categories):
        source = ColumnDataSource({
            'quarters': plot_data[cat]['quarters'],
            'values': plot_data[cat]['values']
        })
        
        # Use proper name from mapping
        display_name = category_names.get(cat, cat)
        
        line = p.line(
            x='quarters',
            y='values',
            source=source,
            line_width=3,
            color=colors[i],
            name=display_name  # Used in tooltips
        )
        
        circle = p.circle(
            x='quarters',
            y='values',
            source=source,
            size=8,
            color=colors[i],
            name=display_name
        )
        
        # Add to legend items
        legend_items.append((display_name, [line, circle]))
    
    # Create stand-alone legend
    legend = Legend(
        items=legend_items,
        location="center",
        title="Technology Categories",
        title_text_font_size="14pt",
        label_text_font_size="12pt",
        click_policy="hide",
        glyph_height=20,
        glyph_width=20,
        padding=5,
        spacing=8,
        background_fill_alpha=0.5
    )
    
    # Add the legend outside the plot
    p.add_layout(legend, 'right')
    
    # Configure plot
    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Pushers per 100k population'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.background_fill_color = "#f9f9f9"
    p.grid.grid_line_alpha = 0.3
    
    note = Div(
        text=(
            f"<div style='font-size: 12px; color: #666; margin-top: 5px;'>"
            f"Note: Values represent the sum of pushers per 100k population for languages in each category."
            f"</div>"
        )
    )
    
    return column(p, note)

# Create tabs
tabs = []

for i, categories in enumerate(tab_categories):
    tab_content = create_category_plot(bd_data, categories, f"Group {i+1}")
    tab = TabPanel(child=tab_content, title=f"Group {i+1}")
    tabs.append(tab)

# Show the tabbed plot
show(Tabs(tabs=tabs))



In [155]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div, Legend
from bokeh.palettes import Category10
from bokeh.layouts import column

# Show Bokeh plots in notebook
output_notebook()

# Define the categories with improved names
categories = {
    "WebDev": [  # Web Development (Frontend & Backend)
        "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "ASP.NET", "Vue", "React", "Angular", "Svelte"
    ],
    "MobileDev": [  # Mobile App Development
        "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
    ],
    "DataScience": [  # Data Science / AI / Machine Learning
        "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
    ],
    "SystemsProg": [  # Systems Programming
        "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
    ],
    "GameDev": [  # Game Development
        "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
    ],
    "DevOps": [  # Scripting / Automation / DevOps
        "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
    ],
    "Embedded": [  # Embedded / IoT
        "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
    ],
    "Scientific": [  # Academic / Research / Scientific Computing
        "MATLAB", "R", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
    ],
    "Database": [  # Database / Query Languages
        "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
    ],
    "Functional": [  # Functional Programming / Language Design
        "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
    ],
    "UI_UX": [  # UI / UX / Markup / Templating
     "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
    ],
    "Legacy": [  # Miscellaneous / Niche / Historical / Low Market Demand
        "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
    ]
}

# Define a function to check if a language belongs to a category (case-insensitive)
def belongs_to_category(language, category_languages):
    if not isinstance(language, str):
        return 0
    return 1 if language.upper() in [lang.upper() for lang in category_languages] else 0

# Create a copy of balanced_df to add category columns
balanced_df_cat = balanced_df.copy()

# For each category, create a binary column
for category_name, category_languages in categories.items():
    column_name = f"cat_{category_name}"  # Prefix with 'cat_' for clarity
    balanced_df_cat[column_name] = balanced_df_cat['language'].apply(
        lambda lang: belongs_to_category(lang, category_languages)
    )

# Focus only on Bangladesh data
country_code = 'IN'
country_name = 'India'

# Filter the data for Bangladesh
bd_data = balanced_df_cat[balanced_df_cat['iso2_code'] == country_code].copy()

# List of all category columns
cat_columns = ['cat_WebDev', 'cat_DevOps', 'cat_UI_UX', 'cat_MobileDev', 'cat_DataScience', 'cat_SystemsProg', 
               'cat_GameDev', 'cat_Embedded', 'cat_Scientific', 
               'cat_Database', 'cat_Functional', 'cat_Legacy']

# Organize categories into 3 tabs with 4 categories each
tab_categories = [
    cat_columns[0:4],   # Tab 1: WebDev, MobileDev, DataScience, SystemsProg
    cat_columns[4:8],   # Tab 2: GameDev, DevOps, Embedded, Scientific
    cat_columns[8:12]   # Tab 3: Database, Functional, UI_UX, Legacy
]

# Mapping for more readable category names
category_names = {
    'cat_WebDev': 'Web Development',
    'cat_MobileDev': 'Mobile Development',
    'cat_DataScience': 'Data Science',
    'cat_SystemsProg': 'Systems Programming',
    'cat_GameDev': 'Game Development',
    'cat_DevOps': 'DevOps',
    'cat_Embedded': 'Embedded Systems',
    'cat_Scientific': 'Scientific Computing',
    'cat_Database': 'Database',
    'cat_Functional': 'Functional Programming',
    'cat_UI_UX': 'UI/UX',
    'cat_Legacy': 'Legacy Systems'
}

def create_category_plot(bd_data, categories, tab_name):
    """Create a plot for specific categories"""
    
    # Sort data by quarter to ensure chronological order
    bd_data = bd_data.sort_values('year_quarter')
    
    # Get unique quarters
    quarters = bd_data['year_quarter'].unique()
    
    # Prepare data for plotting
    plot_data = {}
    
    for quarter in quarters:
        quarter_data = bd_data[bd_data['year_quarter'] == quarter]
        
        # For each category, sum the pushers_pc where the category dummy == 1
        for cat in categories:
            if cat not in plot_data:
                plot_data[cat] = {'quarters': [], 'values': []}
            
            # Filter the data where the category is 1, then sum num_pushers_pc
            cat_value = quarter_data[quarter_data[cat] == 1]['num_pushers_pc'].sum()
            
            plot_data[cat]['quarters'].append(quarter)
            plot_data[cat]['values'].append(cat_value)
    
    # Create figure
    p = figure(
        title=f'Technology Category Adoption in India - {tab_name}',
        x_range=sorted(quarters),
        width=800,
        height=500,
        tooltips=[
            ("Quarter", "@quarters"),
            ("Pushers per 100k", "@values{0.00}"),
            ("Category", "$name")
        ]
    )
    
    # Use Category10 colors
    colors = Category10[len(categories)]
    
    # Create lists to store legend items
    legend_items = []
    
    # Add a line for each category
    for i, cat in enumerate(categories):
        source = ColumnDataSource({
            'quarters': plot_data[cat]['quarters'],
            'values': plot_data[cat]['values']
        })
        
        # Use proper name from mapping
        display_name = category_names.get(cat, cat)
        
        line = p.line(
            x='quarters',
            y='values',
            source=source,
            line_width=3,
            color=colors[i],
            name=display_name  # Used in tooltips
        )
        
        circle = p.circle(
            x='quarters',
            y='values',
            source=source,
            size=8,
            color=colors[i],
            name=display_name
        )
        
        # Add to legend items
        legend_items.append((display_name, [line, circle]))
    
    # Create stand-alone legend
    legend = Legend(
        items=legend_items,
        location="center",
        title="Technology Categories",
        title_text_font_size="14pt",
        label_text_font_size="12pt",
        click_policy="hide",
        glyph_height=20,
        glyph_width=20,
        padding=5,
        spacing=8,
        background_fill_alpha=0.5
    )
    
    # Add the legend outside the plot
    p.add_layout(legend, 'right')
    
    # Configure plot
    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Pushers per 100k population'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.background_fill_color = "#f9f9f9"
    p.grid.grid_line_alpha = 0.3
    
    note = Div(
        text=(
            f"<div style='font-size: 12px; color: #666; margin-top: 5px;'>"
            f"Note: Values represent the sum of pushers per 100k population for languages in each category."
            f"</div>"
        )
    )
    
    return column(p, note)

# Create tabs
tabs = []

for i, categories in enumerate(tab_categories):
    tab_content = create_category_plot(bd_data, categories, f"Group {i+1}")
    tab = TabPanel(child=tab_content, title=f"Group {i+1}")
    tabs.append(tab)

# Show the tabbed plot
show(Tabs(tabs=tabs))



In [156]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div, Legend
from bokeh.palettes import Category10
from bokeh.layouts import column

# Show Bokeh plots in notebook
output_notebook()

# Define the categories with improved names
categories = {
    "WebDev": [  # Web Development (Frontend & Backend)
        "HTML", "CSS", "JavaScript", "TypeScript", "PHP", "Ruby", "ASP.NET", "Vue", "React", "Angular", "Svelte"
    ],
    "MobileDev": [  # Mobile App Development
        "Swift", "Kotlin", "Dart", "Java", "Objective-C", "Flutter", "React Native"
    ],
    "DataScience": [  # Data Science / AI / Machine Learning
        "Python", "R", "Julia", "MATLAB", "Scala", "TensorFlow", "PyTorch", "Jupyter Notebook"
    ],
    "SystemsProg": [  # Systems Programming
        "C", "C++", "Rust", "Go", "Assembly", "Zig", "Ada", "Fortran"
    ],
    "GameDev": [  # Game Development
        "C#", "C++", "Lua", "ShaderLab", "HLSL", "GLSL", "UnrealScript", "Godot", "GDScript"
    ],
    "DevOps": [  # Scripting / Automation / DevOps
        "Bash", "PowerShell", "Python", "Shell", "Groovy", "Makefile", "Dockerfile", "HCL", "Terraform", "Ansible"
    ],
    "Embedded": [  # Embedded / IoT
        "C", "C++", "Rust", "MicroPython", "Arduino", "Verilog", "VHDL", "SystemVerilog", "Embedded C"
    ],
    "Scientific": [  # Academic / Research / Scientific Computing
        "MATLAB", "R", "Julia", "Fortran", "Wolfram", "Maple", "Maxima"
    ],
    "Database": [  # Database / Query Languages
        "SQL", "TSQL", "PLSQL", "PLpgSQL", "HiveQL", "GraphQL", "MongoDB Query Language"
    ],
    "Functional": [  # Functional Programming / Language Design
        "Haskell", "F#", "OCaml", "Erlang", "Elixir", "Clojure", "Scala", "Idris", "PureScript", "Reason"
    ],
    "UI_UX": [  # UI / UX / Markup / Templating
     "CSS", "Markdown", "XML", "XSLT", "SASS", "LESS", "Pug", "Handlebars", "Blade", "Twig"
    ],
    "Legacy": [  # Miscellaneous / Niche / Historical / Low Market Demand
        "COBOL", "Pascal", "Smalltalk", "Prolog", "Lisp", "Delphi", "Forth", "APL", "Ada", "Modula-2", "BASIC"
    ]
}

# Define a function to check if a language belongs to a category (case-insensitive)
def belongs_to_category(language, category_languages):
    if not isinstance(language, str):
        return 0
    return 1 if language.upper() in [lang.upper() for lang in category_languages] else 0

# Create a copy of balanced_df to add category columns
balanced_df_cat = balanced_df.copy()

# For each category, create a binary column
for category_name, category_languages in categories.items():
    column_name = f"cat_{category_name}"  # Prefix with 'cat_' for clarity
    balanced_df_cat[column_name] = balanced_df_cat['language'].apply(
        lambda lang: belongs_to_category(lang, category_languages)
    )

# Focus only on Bangladesh data
country_code = 'LK'
country_name = 'Sri Lanka'

# Filter the data for Bangladesh
bd_data = balanced_df_cat[balanced_df_cat['iso2_code'] == country_code].copy()

# List of all category columns
cat_columns = ['cat_WebDev', 'cat_DevOps', 'cat_UI_UX', 'cat_MobileDev', 'cat_DataScience', 'cat_SystemsProg', 
               'cat_GameDev', 'cat_Embedded', 'cat_Scientific', 
               'cat_Database', 'cat_Functional', 'cat_Legacy']

# Organize categories into 3 tabs with 4 categories each
tab_categories = [
    cat_columns[0:4],   # Tab 1: WebDev, MobileDev, DataScience, SystemsProg
    cat_columns[4:8],   # Tab 2: GameDev, DevOps, Embedded, Scientific
    cat_columns[8:12]   # Tab 3: Database, Functional, UI_UX, Legacy
]

# Mapping for more readable category names
category_names = {
    'cat_WebDev': 'Web Development',
    'cat_MobileDev': 'Mobile Development',
    'cat_DataScience': 'Data Science',
    'cat_SystemsProg': 'Systems Programming',
    'cat_GameDev': 'Game Development',
    'cat_DevOps': 'DevOps',
    'cat_Embedded': 'Embedded Systems',
    'cat_Scientific': 'Scientific Computing',
    'cat_Database': 'Database',
    'cat_Functional': 'Functional Programming',
    'cat_UI_UX': 'UI/UX',
    'cat_Legacy': 'Legacy Systems'
}

def create_category_plot(bd_data, categories, tab_name):
    """Create a plot for specific categories"""
    
    # Sort data by quarter to ensure chronological order
    bd_data = bd_data.sort_values('year_quarter')
    
    # Get unique quarters
    quarters = bd_data['year_quarter'].unique()
    
    # Prepare data for plotting
    plot_data = {}
    
    for quarter in quarters:
        quarter_data = bd_data[bd_data['year_quarter'] == quarter]
        
        # For each category, sum the pushers_pc where the category dummy == 1
        for cat in categories:
            if cat not in plot_data:
                plot_data[cat] = {'quarters': [], 'values': []}
            
            # Filter the data where the category is 1, then sum num_pushers_pc
            cat_value = quarter_data[quarter_data[cat] == 1]['num_pushers_pc'].sum()
            
            plot_data[cat]['quarters'].append(quarter)
            plot_data[cat]['values'].append(cat_value)
    
    # Create figure
    p = figure(
        title=f'Technology Category Adoption in India - {tab_name}',
        x_range=sorted(quarters),
        width=800,
        height=500,
        tooltips=[
            ("Quarter", "@quarters"),
            ("Pushers per 100k", "@values{0.00}"),
            ("Category", "$name")
        ]
    )
    
    # Use Category10 colors
    colors = Category10[len(categories)]
    
    # Create lists to store legend items
    legend_items = []
    
    # Add a line for each category
    for i, cat in enumerate(categories):
        source = ColumnDataSource({
            'quarters': plot_data[cat]['quarters'],
            'values': plot_data[cat]['values']
        })
        
        # Use proper name from mapping
        display_name = category_names.get(cat, cat)
        
        line = p.line(
            x='quarters',
            y='values',
            source=source,
            line_width=3,
            color=colors[i],
            name=display_name  # Used in tooltips
        )
        
        circle = p.circle(
            x='quarters',
            y='values',
            source=source,
            size=8,
            color=colors[i],
            name=display_name
        )
        
        # Add to legend items
        legend_items.append((display_name, [line, circle]))
    
    # Create stand-alone legend
    legend = Legend(
        items=legend_items,
        location="center",
        title="Technology Categories",
        title_text_font_size="14pt",
        label_text_font_size="12pt",
        click_policy="hide",
        glyph_height=20,
        glyph_width=20,
        padding=5,
        spacing=8,
        background_fill_alpha=0.5
    )
    
    # Add the legend outside the plot
    p.add_layout(legend, 'right')
    
    # Configure plot
    p.xaxis.axis_label = 'Quarter'
    p.yaxis.axis_label = 'Pushers per 100k population'
    p.y_range.start = 0
    p.xaxis.major_label_orientation = 0.8
    p.background_fill_color = "#f9f9f9"
    p.grid.grid_line_alpha = 0.3
    
    note = Div(
        text=(
            f"<div style='font-size: 12px; color: #666; margin-top: 5px;'>"
            f"Note: Values represent the sum of pushers per 100k population for languages in each category."
            f"</div>"
        )
    )
    
    return column(p, note)

# Create tabs
tabs = []

for i, categories in enumerate(tab_categories):
    tab_content = create_category_plot(bd_data, categories, f"Group {i+1}")
    tab = TabPanel(child=tab_content, title=f"Group {i+1}")
    tabs.append(tab)

# Show the tabbed plot
show(Tabs(tabs=tabs))



# Economy collaborators

In [157]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel
from bokeh.layouts import column

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TR': 'Turkiye',
    'ZA': 'South Africa',
    'ID': 'Indonesia'
}

# Load the dataset
econ_col = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/economy_collaborators.csv')
# econ_col

In [158]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div
from bokeh.palettes import Category20
from bokeh.layouts import column

# Initialize Bokeh in the notebook
output_notebook()

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TU': 'Turkiye',
    'ZA': 'South Africa', 
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Load the dataset
econ_col = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/economy_collaborators.csv')

def create_weight_plots(data):
    """Create interactive plots for weight analysis by quarter for each source country."""
    tabs = []
    
    # Filter for source countries of interest and exclude EU from destinations
    data = data[data['source'].isin(country_map.keys()) & (data['destination'] != 'EU')]
    
    # Find top 10 destinations by total weight
    top_destinations = (data.groupby('destination')['weight']
                       .sum()
                       .sort_values(ascending=False)
                       .head(10)
                       .index.tolist())
    
    # Filter for top destinations
    data = data[data['destination'].isin(top_destinations)]
    
    # Create year_quarter field
    data['year_quarter'] = data['year'].astype(str) + "-Q" + data['quarter'].astype(str)
    
    for iso2_code, country_name in country_map.items():
        # Filter data by source country
        country_data = data[data['source'] == iso2_code].copy()
        
        if country_data.empty:
            print(f"No data available for {country_name}.")
            continue
            
        # Group by quarter and destination
        grouped_data = (country_data.groupby(['year_quarter', 'destination'])['weight']
                       .sum()
                       .reset_index())
        
        # Create data source
        source = ColumnDataSource(grouped_data)
        
        # Configure plot
        p = figure(
            title=f"Weight Evolution for {country_name} by Destination",
            x_range=sorted(grouped_data['year_quarter'].unique()),
            width=800,
            height=500,
            background_fill_color="#ffffff"
        )
        
        # Create lines for each destination
        destinations = grouped_data['destination'].unique()
        colors = Category20[len(destinations)] if len(destinations) <= 20 else Category20[20]
        
        for i, dest in enumerate(destinations):
            dest_data = grouped_data[grouped_data['destination'] == dest]
            dest_source = ColumnDataSource(dest_data)
            
            p.line(
                x='year_quarter',
                y='weight',
                source=dest_source,
                line_width=3,
                legend_label=dest,
                color=colors[i % len(colors)]
            )
            p.scatter(
                x='year_quarter',
                y='weight',
                source=dest_source,
                size=8,
                color=colors[i % len(colors)],
                legend_label=dest
            )
        
        # Configure legend
        p.legend.title = "Destination Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"
        
        # Move legend outside of the plot area
        p.add_layout(p.legend[0], 'right')
        
        # Configure tools and axes
        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                ("Weight", "@weight{0,0}"),
                ("Destination", "@destination"),
            ],
            mode='vline'
        ))
        
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = 'Weight'
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"
        
        # Add summary statistics
        summary_stats = {
            'total_weight': country_data['weight'].sum(),
            'num_destinations': len(destinations),
            'num_periods': len(grouped_data['year_quarter'].unique()),
            'avg_weight_per_quarter': country_data['weight'].mean()
        }
        
        annotation = Div(
            text=(
                f"<div style='font-size: 14px; line-height: 1.5;'>"
                f"<b>Total Weight:</b> {summary_stats['total_weight']:,.0f}<br>"
                f"<b>Number of Destinations:</b> {summary_stats['num_destinations']}<br>"
                f"<b>Number of Periods:</b> {summary_stats['num_periods']}<br>"
                f"<b>Average Weight per Quarter:</b> {summary_stats['avg_weight_per_quarter']:,.0f}<br>"
                "</div>"
            )
        )
        
        # Create layout with plot and annotation
        layout = column(p, annotation)
        
        # Add as tab
        tab = TabPanel(child=layout, title=country_name)
        tabs.append(tab)
    
    # Create and show tabs
    if tabs:
        tabs_layout = Tabs(tabs=tabs)
        show(tabs_layout)
    else:
        print("No sufficient data to generate plots.")

# Execute function
create_weight_plots(econ_col)

No data available for Turkiye.


In [159]:
import pandas as pd
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool, Tabs, TabPanel, Div
from bokeh.palettes import Category20
from bokeh.layouts import column

# Initialize Bokeh in the notebook
output_notebook()

# Map of ISO codes to country names
country_map = {
    'BD': 'Bangladesh',
    'IN': 'India',
    'LK': 'Sri Lanka',
    'CN': 'China', 
    'TU': 'Turkiye',
    'ZA': 'South Africa', 
    'ID': 'Indonesia', 
    'US': 'United States'
}

# Load the dataset
econ_col = pd.read_csv('https://raw.githubusercontent.com/github/innovationgraph/refs/heads/main/data/economy_collaborators.csv')

def create_weight_plots(data):
    """Create interactive plots showing top 10 contributing sources for each destination country."""
    tabs = []
    
    # Filter for destination countries of interest and exclude EU from sources
    data = data[data['destination'].isin(country_map.keys()) & (data['source'] != 'EU')]
    
    # Create year_quarter field
    data['year_quarter'] = data['year'].astype(str) + "-Q" + data['quarter'].astype(str)
    
    for iso2_code, country_name in country_map.items():
        # Filter data by destination country
        country_data = data[data['destination'] == iso2_code].copy()
        
        if country_data.empty:
            print(f"No data available for {country_name}.")
            continue
        
        # Find top 10 source countries by total weight for this destination
        top_sources = (country_data.groupby('source')['weight']
                      .sum()
                      .sort_values(ascending=False)
                      .head(10)
                      .index.tolist())
        
        # Filter for top sources
        country_data = country_data[country_data['source'].isin(top_sources)]
            
        # Group by quarter and source
        grouped_data = (country_data.groupby(['year_quarter', 'source'])['weight']
                       .sum()
                       .reset_index())
        
        # Create data source
        source = ColumnDataSource(grouped_data)
        
        # Configure plot
        p = figure(
            title=f"Top 10 Contributors to {country_name} by Quarter",
            x_range=sorted(grouped_data['year_quarter'].unique()),
            width=800,
            height=500,
            background_fill_color="#ffffff"
        )
        
        # Create lines for each source country
        sources = grouped_data['source'].unique()
        colors = Category20[len(sources)] if len(sources) <= 20 else Category20[20]
        
        for i, src in enumerate(sources):
            src_data = grouped_data[grouped_data['source'] == src]
            src_source = ColumnDataSource(src_data)
            
            p.line(
                x='year_quarter',
                y='weight',
                source=src_source,
                line_width=3,
                legend_label=src,
                color=colors[i % len(colors)]
            )
            p.scatter(
                x='year_quarter',
                y='weight',
                source=src_source,
                size=8,
                color=colors[i % len(colors)],
                legend_label=src
            )
        
        # Configure legend
        p.legend.title = "Source Countries"
        p.legend.label_text_font_size = "12pt"
        p.legend.title_text_font_size = "14pt"
        p.legend.click_policy = "hide"
        
        # Move legend outside of the plot area
        p.add_layout(p.legend[0], 'right')
        
        # Configure tools and axes
        p.add_tools(HoverTool(
            tooltips=[
                ("Quarter", "@year_quarter"),
                ("Weight", "@weight{0,0}"),
                ("Source", "@source"),
            ],
            mode='vline'
        ))
        
        p.xaxis.axis_label = 'Quarter'
        p.yaxis.axis_label = 'Weight'
        p.y_range.start = 0
        p.xaxis.major_label_orientation = 0.8
        p.xaxis.axis_label_text_font_size = "14pt"
        p.yaxis.axis_label_text_font_size = "14pt"
        p.title.text_font_size = "18pt"
        
        # Add summary statistics
        total_weight = country_data['weight'].sum()
        source_contributions = (country_data.groupby('source')['weight']
                              .sum()
                              .sort_values(ascending=False))
        
        top_contributor = source_contributions.index[0]
        top_weight = source_contributions.iloc[0]
        
        annotation = Div(
            text=(
                f"<div style='font-size: 14px; line-height: 1.5;'>"
                f"<b>Total Incoming Weight:</b> {total_weight:,.0f}<br>"
                f"<b>Number of Top Contributors:</b> {len(sources)}<br>"
                f"<b>Number of Periods:</b> {len(grouped_data['year_quarter'].unique())}<br>"
                f"<b>Top Contributor:</b> {top_contributor} ({top_weight:,.0f})<br>"
                f"<b>Average Weight per Quarter:</b> {total_weight/len(grouped_data['year_quarter'].unique()):,.0f}<br>"
                "</div>"
            )
        )
        
        # Create layout with plot and annotation
        layout = column(p, annotation)
        
        # Add as tab
        tab = TabPanel(child=layout, title=country_name)
        tabs.append(tab)
    
    # Create and show tabs
    if tabs:
        tabs_layout = Tabs(tabs=tabs)
        show(tabs_layout)
    else:
        print("No sufficient data to generate plots.")

# Execute function
create_weight_plots(econ_col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year_quarter'] = data['year'].astype(str) + "-Q" + data['quarter'].astype(str)


No data available for Turkiye.
