Media Analysis of PSI-20 Companies

Insights from News and Public Coverage

Hugo Veríssimo

124348

Objective & Methodology

Discover insights about selected companies traded on the PSI-20 index by analyzing their media mentions, focusing on:

  • Sentiment analysis

  • Named entity recognition

Similar works:

Source code: https://github.com/Hugoverissimo21/FCD-project

Data Extraction from arquivo.pt

Companies and News Sources

companies = {"Banco Comercial Português": ["Banco Comercial Português", "BCP"],
             "Galp Energia": ["Galp Energia", "GALP"],
             "EDP": ["EDP", "Energias de Portugal", "Electricidade de Portugal"],
             "Sonae": ["Sonae", "SON"],
             "Mota-Engil": ["Mota-Engil", "EGL"]}
             
pd.read_csv('noticias.csv')             
                          0                             1
0            www.publico.pt                       Público
1                publico.pt                       Público
2                 www.dn.pt            Diário de Notícias
3                www.rtp.pt                           RTP
4                    rpt.pt                           RTP
5           www.cmjornal.pt              Correio da Manhã
6                www.iol.pt                           IOL
7          www.tvi24.iol.pt                           TVI
8              tvi24.iol.pt                           TVI
9          noticias.sapo.pt                          Sapo
10            observador.pt                    Observador
11              expresso.pt                      Expresso
12          www.expresso.pt                      Expresso
13              sol.sapo.pt                           SOL
14  www.jornaldenegocios.pt            Jornal de Negócios
15                www.jn.pt            Jornal de Notícias
16                    jn.pt            Jornal de Notícias
17               ionline.pt                      Jornal i
18           sicnoticias.pt                           SIC
19       www.sicnoticias.pt                           SIC
20           www.lux.iol.pt                           Lux
21           www.ionline.pt                      Jornal i
22           news.google.pt                        Google
23      www.dinheirovivo.pt                 Dinheiro Vivo
24             www.aeiou.pt                         AEIOU
25                 aeiou.pt                         AEIOU
26               www.tsf.pt                           TSF
27                   tsf.pt                           TSF
28            www.sabado.pt                        Sábado
29             dnoticias.pt  Diário de Notícias (Madeira)
30         www.dnoticias.pt  Diário de Notícias (Madeira)
31        economico.sapo.pt              Jornal Económico
32       cnnportugal.iol.pt                  CNN Portugal
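The websites argument passed to the API below is built by a helper news() (called later inside data01) whose definition is not shown; a minimal sketch of what it might look like, assuming it simply joins the domains from the first column of noticias.csv with commas:

import pandas as pd

def news():
    """Hypothetical helper: comma-separated domains for the &siteSearch parameter."""
    noticias = pd.read_csv("noticias.csv")
    # first column holds the domains, second column the outlet names
    return ",".join(noticias.iloc[:, 0])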

API Request Function

import requests

def api_request(search, websites, date):
    """
    search: expression/word (what to look for)
    websites: comma separated websites (where to look for)
    date: list such as [20030101, 20031231] (when to look for)
    -
    returns the response_items from arquivo.pt api
    """
    search = f"q=%22{search.replace(' ', '%20')}%22"
    websites = f"&siteSearch={websites}"
    date = f"&from={date[0]}&to={date[1]}"    
    url = (
        f"https://arquivo.pt/textsearch?{search}{websites}{date}"
        "&fields=linkToArchive,linkToExtractedText,tstamp"
        "&maxItems=500"
        "&dedupValue=25"
        "&dedupField=url"
        "&prettyPrint=false"
        "&type=html"
        )
    json = requests.get(url).json()
    data = json["response_items"]
    if len(data) == 500:
        print(f"You might have lost some data: {search, date}")
    return data
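As an illustration (the query values are made up for this example), a single request for the alias "GALP" on Público during the first half of 2010 would be:

items = api_request("GALP", "www.publico.pt,publico.pt", [20100101, 20100630])
print(len(items), items[0]["linkToArchive"])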

API Request

def data01(companies):
    """
    Apply the API request for each company. Save the data into a Parquet file.
    
    - companies (dict): A dictionary of companies with their aliases.
    - companies_Ex = {"Galp Energia": ["Galp Energia", "GALP"],
                      "EDP": ["EDP", "Energias de Portugal"]}
    """
    # CREATING DF WITH COMPANIES AND THEIR ALIASES
    companies_data = {"companies": [], "aliases": []}
    for company in companies.keys():
        companies_data["companies"].append(company)
        companies_data["aliases"].append(companies[company])
    df = pd.DataFrame(companies_data).set_index("companies")

    # SITES OF WHERE TO LOOK FOR NEWS
    websites = news()

    # INITIALIZING API REQUESTS
    # groups of 3 years, from 2000 to 2020
    for cluster in range(2000, 2021, 3):
        api_cluster = [] # reset api_cluster for each cluster (group of 3 years)
        print(f"Processing cluster: {cluster}")
        print("Processing company:", end=" ")
        # iterate over each company
        for company_aliases in df["aliases"]:
            api_company = [] #reset api_company for each company
            print(f"{company_aliases[0]}", end = "; ")
            # iterate over each company's aliases
            for alias in company_aliases:
                # iterate over each cluster's year
                for year in range(cluster, cluster + 3):                        
                    api_aliasS1 = api_request(alias, websites, [int(f"{year}0101"), int(f"{year}0630")])
                    api_aliasS2 = api_request(alias, websites, [int(f"{year}0701"), int(f"{year}1231")])
                    api_company += api_aliasS1 + api_aliasS2
            # save company data
            api_cluster.append(api_company)

        # save cluster into df
        df[f"api.{cluster}"] = api_cluster
        print(f"{cluster} OK.")

    # save all data
    df.to_parquet("data01.parquet")
    print("Finished.")

data01.parquet

pd.read_parquet("data01.parquet")
aliases api.2000 api.2003 api.2006 api.2009 api.2012 api.2015 api.2018
companies
Banco Comercial Português [Banco Comercial Português, BCP] [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback...
Galp Energia [Galp Energia, GALP] [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback...
EDP [EDP, Energias de Portu... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback...
Sonae [Sonae, SON] [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback...
Mota-Engil [Mota-Engil, EGL] [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback... [{'linkToArchive': 'https://arquivo.pt/wayback...

data01.parquet

pd.read_parquet("data01.parquet").map(len)
aliases api.2000 api.2003 api.2006 api.2009 api.2012 api.2015 api.2018
companies
Banco Comercial Português 2 153 241 183 561 1074 1430 954
Galp Energia 2 128 389 272 582 1156 1391 968
EDP 3 133 339 173 653 1232 1970 1096
Sonae 2 192 435 279 502 1215 1705 1196
Mota-Engil 2 4 83 60 195 538 828 560

\(\ \)

pd.read_parquet("data01.parquet").iloc[0,1][0]
{
    "linkToArchive": "https://arquivo.pt/wayback/20001213153700/http://www.publico.pt/publico/1999/09/28/Economia/E08.html",
    "linkToExtractedText": "https://arquivo.pt/textextracted?m=http%3A%2F%2Fwww.publico.pt%2Fpublico%2F1999%2F09%2F28%2FEconomia%2FE08.html%2F20001213153700",
    "tstamp": "20001213153700"
}
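Note that linkToExtractedText is simply the arquivo.pt text-extraction endpoint built from the original URL and the tstamp; a purely illustrative check of that relationship for the item above:

from urllib.parse import quote

item = pd.read_parquet("data01.parquet").iloc[0, 1][0]
original_url = "http://www.publico.pt/publico/1999/09/28/Economia/E08.html"
rebuilt = ("https://arquivo.pt/textextracted?m="
           + quote(original_url, safe="") + "%2F" + item["tstamp"])
print(rebuilt == item["linkToExtractedText"])  # expected: True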

Text Extraction

Problems Faced

  • Duplicated texts due to &dedupValue=25&dedupField=url

  • Extracted texts that do not mention any of the company aliases

  • API usage limit (250 requests/min, error 429)

  • API error 404 for certain URLs

Text Extraction Function

import time

from bs4 import BeautifulSoup

def extracText(linkToExtractedText):
    # infinite loop to handle retries
    while True:
        response = requests.get(linkToExtractedText)
        status_code = response.status_code
        
        # request is successful (200 OK)
        if status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            return soup.get_text()
        
        # too many requests (429 WAIT)
        elif status_code == 429:
            print(" (...)", end = "")
            time.sleep(60)
        
        # no text to be found (404 PASS)
        elif status_code == 404:
            return 0
        
        # unexpected status code: report it and give up on this link
        else:
            print(f"Request failed: {status_code}; Link was {linkToExtractedText}")
            return None

Applying Text Extraction

def filterColumn(column, aliases):
    """
    extract text for each cell in a column
        - check for duplicated text
        - check for api errors
        - check for aliases in text
    """
    filtered_column = []
    for row in aliases.index:
        print(f"; {row}", end = "")
        
        seen_text = set()
        filtered_cell = []
        for i in column.loc[row]:
            # extract text from linkToExtractedText
            text = extracText(i['linkToExtractedText'])
            
            # skip duplicates or error 404
            if (text in seen_text) or (not text):
                continue
              
            # check for aliases in text
            elif any(alias.lower() in text.lower() for alias in aliases.loc[row]):
                i["ExtractedText"] = text
                i.pop('linkToExtractedText', None)
                filtered_cell.append(i) # save to cell
                seen_text.add(text) # mark as seen

        filtered_column.append(filtered_cell)
    return filtered_column


from datetime import datetime

def processColumns(cols2Process):
    """
    cols2Process = ["api.2000", "api.2003"]
    """
    print(f"Starting: {datetime.now()}")
    try:
        # try to resume a previously started extraction
        df = pd.read_parquet("data02.parquet")
    except FileNotFoundError:
        # if it does not exist yet, start a new one from data01.parquet
        pd.read_parquet("data01.parquet").to_parquet("data02.parquet")
        df = pd.read_parquet("data02.parquet")
       
    for column in cols2Process:
      
        # verify if the column has already been processed
        has_link = "linkToExtractedText" in df.iloc[-1][column][-1]
        has_extracText = "ExtractedText" in df.iloc[-1][column][-1]
        if not has_link and has_extracText:
            print(f"\n{column} already done. Skipping.")
        
        # if not, process the column and save it
        else:
            print(f"\nProcessing {column}", end = ": ")
            df[column] = filterColumn(df[column], df["aliases"])
            df.to_parquet("data02.parquet")
    print(f"\nEnded: {datetime.now()}.")

processColumns(["api.2000", "api.2003", "api.2006", "api.2009", "api.2012", "api.2015", "api.2018"])

data02.parquet

pd.read_parquet("data02.parquet").map(len) - pd.read_parquet("data01.parquet").map(len)
aliases api.2000 api.2003 api.2006 api.2009 api.2012 api.2015 api.2018
companies
Banco Comercial Português 0 -63 -50 -14 -64 -91 -211 -130
Galp Energia 0 -62 -156 -91 -113 -192 -287 -156
EDP 0 -53 -94 -33 -115 -156 -442 -224
Sonae 0 -62 -117 -40 -43 -106 -305 -170
Mota-Engil 0 -1 -16 -34 -31 -154 -232 -115

\(\ \)

pd.read_parquet("data02.parquet").iloc[0,1][0]
{
    "tstamp": "20001213153700",
    "linkToArchive": "https://arquivo.pt/wayback/20001213153700/http://w...",
    "ExtractedText": "POL | Economia | Processo de infraccao contra Portugal vai ate ao fim SECCOES 1a Pagina Destaque Editoriais A Semana Politica Internacional Espaco Publico Sociedade Cultura Desporto Economia Media Local Lisboa Local Porto Ultima Pagina Ficha Tecnica SO TEXTO   .   Alternativa ao acordo com o BSCH nao trava contencioso Processo de Infraccao Contra P..."
}

Data Filtering

Problems Faced

  • Some extracted texts are very similar, though not identical, and should be removed based on a similarity criterion.

  • There are also “news” items that aren’t actually relevant to the company: they might be ads, irrelevant financial reports, or other unrelated content. These could be filtered out as well.

Since the first problem has quadratic complexity (\(O(n^2)\) pairwise comparisons), removing irrelevant news first reduces the dataset and speeds up the near-duplicate removal.

\(\ \)

Note: it would have been ideal to filter these out during text extraction.
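For reference, the similarity criterion mentioned above is implemented later with fuzz.ratio, which scores a pair of strings from 0 to 100; a tiny illustration with made-up headlines, assuming the thefuzz package (fuzzywuzzy and rapidfuzz expose the same function):

from thefuzz import fuzz

a = "Galp Energia reforça posição em Angola"
b = "Galp Energia reforça a posição em Angola"
print(fuzz.ratio(a, b))  # ≈97, above the threshold of 90 used later, so these count as near-duplicates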

Dataset for News Detection Model

To address the ‘Not News’ problem, a small dataset with several features was created to train and test a model for detecting non-news content.

pd.read_csv("dtree01.csv").info()
<class 'pandas.core.frame.DataFrame'>
Index: 610 entries, 0 to 609
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   news          610 non-null    bool   
 1   IstALIAS      610 non-null    int64  
 2   propAN        610 non-null    float64
 3   txtSZ         610 non-null    int64  
 4   countALI      610 non-null    int64  
 5   countCAPS     610 non-null    int64  
 6   countDTS      610 non-null    int64  
 7   countHOUR     610 non-null    int64  
 8   countHTTP     610 non-null    int64  
 9   DTS+Hours     610 non-null    int64  
 10  alias_in_url  610 non-null    bool   
 11  news_site     610 non-null    object 
 12  date          610 non-null    int64  
 13  timestamp     610 non-null    int64  
 14  url           610 non-null    object 
 15  txt           610 non-null    object 
dtypes: bool(2), float64(1), int64(10), object(3)
memory usage: 72.7+ KB
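Each feature column is computed by a helper function with the signature feature(text, aliases), which is how they are invoked later via globals()[feature](text, aliases). Those helpers are not listed here, so the two below are only an illustrative sketch of the idea:

def txtSZ(text, aliases):
    """Illustrative only: size of the extracted text."""
    return len(text)

def countALI(text, aliases):
    """Illustrative only: number of alias occurrences in the text."""
    low = text.lower()
    return sum(low.count(alias.lower()) for alias in aliases)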

Choosing and Training the Model

After experimenting with several models and feature combinations, a decision tree with the following features and hyperparameters was found to fit the data best.

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

dataset = pd.read_csv("dtree01.csv")
X = dataset[['IstALIAS', 'propAN', 'txtSZ', 'countALI', 
              'countDTS', 'countHOUR', 'countCAPS']]
y = dataset['news'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.15, random_state=21)

clf = DecisionTreeClassifier(random_state=21, max_depth=5, 
    min_samples_split=15, min_samples_leaf=10)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
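The accuracy and classification report shown next can be obtained with the standard scikit-learn metrics, and the fitted tree is presumably persisted with joblib, since it is later reloaded as load('dtree01.joblib'); a minimal sketch:

from sklearn.metrics import accuracy_score, classification_report
from joblib import dump

print("Accuracy after tuning:", accuracy_score(y_test, y_pred))
print("Classification Report:", classification_report(y_test, y_pred))

dump(clf, "dtree01.joblib")  # reloaded later with load('dtree01.joblib')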

Decision Tree Model

Accuracy after tuning: 0.9130434782608695
Classification Report: precision    recall  f1-score   support
                    0       0.92      0.97      0.94        68
                    1       0.90      0.75      0.82        24
  
             accuracy                           0.91        92
            macro avg       0.91      0.86      0.88        92
         weighted avg       0.91      0.91      0.91        92
Fig. 1: Decision tree visualization.

Decision Tree Model Application

Fig. 2: Distribution of the predicted class 1 (news) probabilities when the decision tree model is applied to the entire dataset.

Decision Tree Model Application

import re
from joblib import load

# keep only sentences that mention an alias (applied to texts scored between 0.4 and 0.6)
def filter_sentences_by_keywords(text, aliases):
    # split the text by punctuation, multiple spaces or newlines
    sentences = re.split(r'(?<=[.!?]) +|\s{2,}|\n+', text)
    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in aliases)]
    return ' '.join(filtered_sentences)

# load dtree01 model and set everything
clf = load('dtree01.joblib')
data = pd.read_parquet("data02.parquet")
features = ['IstALIAS', 'propAN', 'txtSZ', 'countALI', 'countDTS', 'countHOUR', 'countCAPS']

# apply the model
for row in data.index:
    print(f"\n {row}", end = ": ")
    aliases = data.loc[row, "aliases"]
    for column in data.columns[1:]:
        print(column, end = " | ")
        validation = []
        for req in data.loc[row, column]:
            text = req["ExtractedText"]
            df = {}
            for feature in features:
                # call the helper function that computes this feature from the text
                df[feature] = [globals()[feature](text, aliases)]
            probability = clf.predict_proba(pd.DataFrame(df))[0, 1]
            if probability < 0.4:
                pass  # most likely not news: discard
            elif probability <= 0.6:  # uncertain: keep only the sentences that mention an alias
                req["newsProbability"] = round(probability, 3)
                req["ExtractedText"] = filter_sentences_by_keywords(text, aliases)
                validation.append(req)
            else:  # probability > 0.6: keep the full text
                req["newsProbability"] = round(probability, 3)
                validation.append(req)
        data.loc[row, column] = validation

# save the results
data.to_parquet("data03.parquet")

data03.parquet

pd.read_parquet("data03.parquet").map(len) - pd.read_parquet("data02.parquet").map(len)
aliases api.2000 api.2003 api.2006 api.2009 api.2012 api.2015 api.2018
companies
Banco Comercial Português 0 -77 -157 -155 -350 -677 -832 -441
Galp Energia 0 -61 -190 -172 -321 -629 -681 -437
EDP 0 -69 -195 -122 -362 -779 -1064 -432
Sonae 0 -122 -297 -230 -369 -968 -1071 -725
Mota-Engil 0 -3 -63 -23 -117 -285 -470 -265

\(\ \)

pd.read_parquet("data03.parquet").iloc[0,1][0]
{
    "newsProbability": 0.933,
    "tstamp": "20010913052557",
    "linkToArchive": "https://arquivo.pt/wayback/20010913052557/http://w...",
    "ExtractedText": "DN   13 de Setembro de 2001 Intalacoes do BCP servem de refugio As instalacoes do Banco Comercial Portugues, em Wall Street, foram utilizadas como refugio por dezenas de pessoas depois dos atentados ao World Trade Center. Nenhum dos 48 funcionarios daquela sucursal banc..."
}

Filtering Out Similar Texts

import random

from thefuzz import fuzz  # fuzzywuzzy or rapidfuzz also provide fuzz.ratio

def nearDuplicates(lista, threshold=90):
    if not lista:  # guard against empty lists
        return []
    total_data = len(lista) # status
    curr_data = 0 # status
    new_list = [lista[0]]
    texts = [lista[0]["ExtractedText"]]
    for req in lista[1:]:
        curr_data += 1 # status
        ExtractedText = req["ExtractedText"]
        similarity = 0
        for txt in texts:
            similarity = max(similarity, fuzz.ratio(txt, ExtractedText))
            if similarity > threshold:
                break
        if similarity <= threshold:
            new_list.append(req)
            texts.append(ExtractedText)
        if random.uniform(0, 1) < 0.01: # status
            print(f"{curr_data} of {total_data}", end = " | ") # status
    print("") # status
    return new_list
    
df = pd.read_parquet("data03.parquet").map(lambda x: list(x))
# group all api.YYYY columns
df['news'] = df.iloc[:, 1:].sum(axis=1)
# keep only 'aliases' and 'news' columns
df_filtered = df.iloc[:, [0, -1]].copy()  # copy to avoid SettingWithCopyWarning
# apply nearDuplicates function
df_filtered.loc[:, "news"] = df_filtered["news"].map(lambda x: nearDuplicates(x))


def tstampANDsource(lista):
    """formatting timestamp and extract source"""
    new_list = []
    noticias = pd.read_csv("noticias.csv")
    for req in lista:
        # news source
        linkToArchive = req["linkToArchive"]
        foundSource = False
        for index, row in noticias.iterrows():
            if row.iloc[0] in linkToArchive:
                req["newsSource"] = row.iloc[1]
                foundSource = True
                break
        if not foundSource:
            req["newsSource"] = "unknown"
        # timestamp
        req["tstamp"] = req["tstamp"][:6]
        # SAVE
        new_list.append(req)
    return new_list
    
df_filtered.loc[:, "news"] = df_filtered["news"].map(lambda x: tstampANDsource(x))
# save
df_filtered.to_parquet("data04.parquet")

data04.parquet

pd.read_parquet("data04.parquet")
aliases news
companies
Banco Comercial Português [Banco Comercial Português, BCP] [{'ExtractedText': 'DN   13 de Setembro de 200...
Galp Energia [Galp Energia, GALP] [{'ExtractedText': 'RTP Galp reforça posição n...
EDP [EDP, Energias de Portugal, Electricidade de P... [{'ExtractedText': 'DN-Sinteses Negocios 9 de ...
Sonae [Sonae, SON] [{'ExtractedText': 'DN-Sinteses 5 de Março de ...
Mota-Engil [Mota-Engil, EGL] [{'ExtractedText': 'RTP Lucro da Mota-Engil so...

\(\ \)

pd.read_parquet("data04.parquet").iloc[0,1][0]
{
    "tstamp": "200109",
    "newsProbability": 0.933,
    "newsSource": "Diario de Noticias",
    "linkToArchive": "https://arquivo.pt/wayback/20010913052557/http://w...",
    "ExtractedText": "DN   13 de Setembro de 2001 Intalacoes do BCP servem de refugio As instalacoes do Banco Comercial Portugues, em Wall Street, foram..."
}

data04.parquet

Fig. 3: Remaining news after filtering.

Banco Comercial Português \(4596 \rightarrow 807\) | Galp Energia \(4886 \rightarrow 809\)

EDP \(5596 \rightarrow 954\) | Sonae \(5524 \rightarrow 520\) | Mota-Engil \(2268 \rightarrow 296\)