Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Imports grouped per PEP 8: stdlib, third-party, environment-local tools,
# one import per line (the original crammed all five onto one line).
import os
import re

import numpy as np
import pandas as pd

import ace_tools as tools

# Reload the events dataset; downstream code reads the 'text' column and
# writes out 'date', 'text', 'class', 'scale'.
df = pd.read_csv('/mnt/data/events.csv')
def simple_lemma(token: str):
    """Crude suffix-stripping lemmatizer.

    Lowercases *token* and strips a plural-like suffix, longest first:
    'ies' -> 'y', then 'es', then 's'. Length guards keep very short
    words (e.g. 'gas') untouched. Returns the lowercased word otherwise.
    """
    word = token.lower()
    # Check the longest suffix first so 'cities' -> 'city', not 'citie'.
    if len(word) > 4 and word.endswith('ies'):
        return word[:-3] + 'y'
    if len(word) > 3 and word.endswith('es'):
        return word[:-2]
    if len(word) > 3 and word.endswith('s'):
        return word[:-1]
    return word
# Keyword lexicons per event class; the classifier checks higher-numbered
# classes first (4 = epidemic/disease, 3 = independence/liberation,
# 2 = war/conflict, 1 = natural/man-made disaster).
# NOTE(review): 'self-determination' contains a hyphen and can never match —
# the tokenizer only yields pure alphabetic runs; confirm intent.
lexicons = {
    4: [
        'epidemic', 'pandemic', 'plague', 'cholera', 'influenza', 'flu',
        'smallpox', 'measles', 'yellow', 'fever', 'typhus', 'virus',
        'disease', 'outbreak', 'covid', 'sars', 'ebola', 'coronavirus',
        'infection',
    ],
    3: [
        'independence', 'liberation', 'decolonization', 'sovereignty',
        'nationalist', 'secession', 'freedom', 'emancipation',
        'self-determination', 'unification', 'separate', 'separatist',
    ],
    2: [
        'war', 'battle', 'rebellion', 'siege', 'invasion', 'conflict',
        'raid', 'hostility', 'military', 'crusade', 'skirmish', 'ambush',
        'massacre', 'attack', 'bombing', 'uprising', 'revolution',
        'expedition', 'occupation', 'insurgency', 'engagement',
        'offensive', 'assault',
    ],
    1: [
        'fire', 'earthquake', 'eruption', 'volcano', 'flood', 'hurricane',
        'storm', 'tornado', 'tsunami', 'cyclone', 'typhoon', 'mudslide',
        'avalanche', 'drought', 'explosion', 'disaster', 'catastrophe',
        'shipwreck', 'sinking', 'spill', 'crash', 'collapse', 'accident',
        'blaze', 'landslide', 'wildfire', 'heatwave', 'slide',
    ],
}
# Set form for O(1) membership tests against a document's token set.
lex_set = {cls: set(words) for cls, words in lexicons.items()}
# Adjective cues for severity; 5 = most severe. There is deliberately no
# level-1 list: text with a number but no adjective defaults to the numeric
# bucket alone.
severity_words = {
    5: ['catastrophic', 'devastating', 'deadliest', 'massive', 'worst',
        'largest', 'extreme', 'mega', 'strongest', 'powerful', 'violent',
        'annihilating'],
    4: ['severe', 'major', 'intense', 'huge', 'large', 'significant',
        'heavy', 'destructive'],
    3: ['moderate', 'notable', 'considerable'],
    2: ['minor', 'small', 'light', 'weak', 'limited'],
}
# One alternation per level, compiled once. NOTE(review): no word boundaries,
# so matches are plain substrings ('weak' also hits 'weakness') — confirm
# this looseness is intended.
severity_patterns = {
    level: re.compile('|'.join(map(re.escape, words)), re.IGNORECASE)
    for level, words in severity_words.items()
}


def estimate_scale(text):
    """Estimate an event's severity on a 1-5 scale from free text.

    Combines two independent signals and returns the larger:
      * the largest number found (scaled by an optional million/thousand/
        hundred word), bucketed by order of magnitude;
      * the highest-level severity adjective matched by severity_patterns.

    Returns None when neither signal is present. Note that ANY number
    contributes — the casualty-noun group in the regex is optional — so
    years and other incidental figures also raise the estimate.
    """
    t = text.lower()
    nums_units = re.findall(
        r'(\d+(?:[\d,\.]*))\s*'
        r'(million|millions|thousand|thousands|hundred|hundreds)?\s*'
        r'(?:dead|killed|deaths|casualties|injured|people)?',
        t)
    max_val = 0
    for num_str, unit in nums_units:
        # Keep digits and dots, then drop trailing dots ("1,000." -> "1000").
        cleaned = re.sub(r'[^0-9\.]', '', num_str).rstrip('.')
        if not cleaned:
            continue
        try:
            num_clean = float(cleaned)
        except ValueError:
            # BUGFIX: the regex admits multi-dot captures such as "1.2.3"
            # (version strings, dotted dates), which float() rejects and
            # previously crashed the whole pipeline. Skip them instead.
            continue
        if unit:
            if 'million' in unit:
                num_clean *= 1_000_000
            elif 'thousand' in unit:
                num_clean *= 1_000
            elif 'hundred' in unit:
                num_clean *= 100
        max_val = max(max_val, num_clean)
    # Bucket the largest figure by magnitude.
    scale_num = None
    if max_val:
        if max_val >= 100000:
            scale_num = 5
        elif max_val >= 10000:
            scale_num = 4
        elif max_val >= 1000:
            scale_num = 3
        elif max_val >= 100:
            scale_num = 2
        else:
            scale_num = 1
    # Highest adjective level wins; stop at the first hit.
    scale_adj = None
    for s in [5, 4, 3, 2]:
        if severity_patterns[s].search(t):
            scale_adj = s
            break
    if scale_num is not None or scale_adj is not None:
        return max(scale_num or 0, scale_adj or 0)
    return None
# Word extractor: ASCII alphabetic runs only — digits, hyphens, and
# punctuation are split points and never appear in tokens.
token_pattern = re.compile(r"[A-Za-z]+")


def classify(text):
    """Assign an event class from lexicon hits.

    Lemmatizes every alphabetic token in *text* and returns the first class
    in priority order 4 > 3 > 2 > 1 whose lexicon shares a word with the
    text; 0 when nothing matches.
    """
    lemmas = {simple_lemma(word) for word in token_pattern.findall(str(text))}
    for label in (4, 3, 2, 1):
        if lemmas & lex_set[label]:
            return label
    return 0
# Annotate every event, persist the result, and show a preview to the user.
df['class'] = df['text'].apply(classify)
df['scale'] = df['text'].apply(estimate_scale)

out_path = '/mnt/data/events_classified_tokens.csv'
output_cols = ['date', 'text', 'class', 'scale']
df[output_cols].to_csv(out_path, index=False)

tools.display_dataframe_to_user("Token-based classification preview", df[output_cols].head(20))
print(f"File saved to: {out_path}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement