Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Imports grouped per PEP 8: stdlib, third-party, environment-local tools,
# one import per line (the original crammed all five onto one line).
import os
import re

import numpy as np
import pandas as pd

import ace_tools as tools

# Reload the events dataset; downstream code reads the 'text' column and
# writes out 'date', 'text', 'class', 'scale'.
df = pd.read_csv('/mnt/data/events.csv')
def simple_lemma(token: str):
    """Crude suffix-stripping lemmatizer.

    Lowercases *token* and strips a plural-like suffix, longest first:
    'ies' -> 'y', then 'es', then 's'. Length guards keep very short
    words (e.g. 'gas') untouched. Returns the lowercased word otherwise.
    """
    word = token.lower()
    # Check the longest suffix first so 'cities' -> 'city', not 'citie'.
    if len(word) > 4 and word.endswith('ies'):
        return word[:-3] + 'y'
    if len(word) > 3 and word.endswith('es'):
        return word[:-2]
    if len(word) > 3 and word.endswith('s'):
        return word[:-1]
    return word
# Keyword lexicons per event class; the classifier checks higher-numbered
# classes first (4 = epidemic/disease, 3 = independence/liberation,
# 2 = war/conflict, 1 = natural/man-made disaster).
# NOTE(review): 'self-determination' contains a hyphen and can never match —
# the tokenizer only yields pure alphabetic runs; confirm intent.
lexicons = {
    4: [
        'epidemic', 'pandemic', 'plague', 'cholera', 'influenza', 'flu',
        'smallpox', 'measles', 'yellow', 'fever', 'typhus', 'virus',
        'disease', 'outbreak', 'covid', 'sars', 'ebola', 'coronavirus',
        'infection',
    ],
    3: [
        'independence', 'liberation', 'decolonization', 'sovereignty',
        'nationalist', 'secession', 'freedom', 'emancipation',
        'self-determination', 'unification', 'separate', 'separatist',
    ],
    2: [
        'war', 'battle', 'rebellion', 'siege', 'invasion', 'conflict',
        'raid', 'hostility', 'military', 'crusade', 'skirmish', 'ambush',
        'massacre', 'attack', 'bombing', 'uprising', 'revolution',
        'expedition', 'occupation', 'insurgency', 'engagement',
        'offensive', 'assault',
    ],
    1: [
        'fire', 'earthquake', 'eruption', 'volcano', 'flood', 'hurricane',
        'storm', 'tornado', 'tsunami', 'cyclone', 'typhoon', 'mudslide',
        'avalanche', 'drought', 'explosion', 'disaster', 'catastrophe',
        'shipwreck', 'sinking', 'spill', 'crash', 'collapse', 'accident',
        'blaze', 'landslide', 'wildfire', 'heatwave', 'slide',
    ],
}
# Set form for O(1) membership tests against a document's token set.
lex_set = {cls: set(words) for cls, words in lexicons.items()}
# Adjective cues for severity; 5 = most severe. There is deliberately no
# level-1 list: text with a number but no adjective defaults to the numeric
# bucket alone.
severity_words = {
    5: ['catastrophic', 'devastating', 'deadliest', 'massive', 'worst',
        'largest', 'extreme', 'mega', 'strongest', 'powerful', 'violent',
        'annihilating'],
    4: ['severe', 'major', 'intense', 'huge', 'large', 'significant',
        'heavy', 'destructive'],
    3: ['moderate', 'notable', 'considerable'],
    2: ['minor', 'small', 'light', 'weak', 'limited'],
}
# One alternation per level, compiled once. NOTE(review): no word boundaries,
# so matches are plain substrings ('weak' also hits 'weakness') — confirm
# this looseness is intended.
severity_patterns = {
    level: re.compile('|'.join(map(re.escape, words)), re.IGNORECASE)
    for level, words in severity_words.items()
}


def estimate_scale(text):
    """Estimate an event's severity on a 1-5 scale from free text.

    Combines two independent signals and returns the larger:
      * the largest number found (scaled by an optional million/thousand/
        hundred word), bucketed by order of magnitude;
      * the highest-level severity adjective matched by severity_patterns.

    Returns None when neither signal is present. Note that ANY number
    contributes — the casualty-noun group in the regex is optional — so
    years and other incidental figures also raise the estimate.
    """
    t = text.lower()
    nums_units = re.findall(
        r'(\d+(?:[\d,\.]*))\s*'
        r'(million|millions|thousand|thousands|hundred|hundreds)?\s*'
        r'(?:dead|killed|deaths|casualties|injured|people)?',
        t)
    max_val = 0
    for num_str, unit in nums_units:
        # Keep digits and dots, then drop trailing dots ("1,000." -> "1000").
        cleaned = re.sub(r'[^0-9\.]', '', num_str).rstrip('.')
        if not cleaned:
            continue
        try:
            num_clean = float(cleaned)
        except ValueError:
            # BUGFIX: the regex admits multi-dot captures such as "1.2.3"
            # (version strings, dotted dates), which float() rejects and
            # previously crashed the whole pipeline. Skip them instead.
            continue
        if unit:
            if 'million' in unit:
                num_clean *= 1_000_000
            elif 'thousand' in unit:
                num_clean *= 1_000
            elif 'hundred' in unit:
                num_clean *= 100
        max_val = max(max_val, num_clean)
    # Bucket the largest figure by magnitude.
    scale_num = None
    if max_val:
        if max_val >= 100000:
            scale_num = 5
        elif max_val >= 10000:
            scale_num = 4
        elif max_val >= 1000:
            scale_num = 3
        elif max_val >= 100:
            scale_num = 2
        else:
            scale_num = 1
    # Highest adjective level wins; stop at the first hit.
    scale_adj = None
    for s in [5, 4, 3, 2]:
        if severity_patterns[s].search(t):
            scale_adj = s
            break
    if scale_num is not None or scale_adj is not None:
        return max(scale_num or 0, scale_adj or 0)
    return None
# Word extractor: ASCII alphabetic runs only — digits, hyphens, and
# punctuation are split points and never appear in tokens.
token_pattern = re.compile(r"[A-Za-z]+")


def classify(text):
    """Assign an event class from lexicon hits.

    Lemmatizes every alphabetic token in *text* and returns the first class
    in priority order 4 > 3 > 2 > 1 whose lexicon shares a word with the
    text; 0 when nothing matches.
    """
    lemmas = {simple_lemma(word) for word in token_pattern.findall(str(text))}
    for label in (4, 3, 2, 1):
        if lemmas & lex_set[label]:
            return label
    return 0
# Annotate every event, persist the result, and show a preview to the user.
df['class'] = df['text'].apply(classify)
df['scale'] = df['text'].apply(estimate_scale)

out_path = '/mnt/data/events_classified_tokens.csv'
output_cols = ['date', 'text', 'class', 'scale']
df[output_cols].to_csv(out_path, index=False)

tools.display_dataframe_to_user("Token-based classification preview", df[output_cols].head(20))
print(f"File saved to: {out_path}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement