Advertisement
iostream_h

Untitled

May 6th, 2025
208
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.67 KB | None | 0 0
  1. import re, pandas as pd, numpy as np, os, ace_tools as tools
  2.  
# Reload the events dataset. Downstream code reads the 'text' column and
# writes 'class'/'scale'; the final export also expects a 'date' column.
# NOTE(review): hard-coded sandbox path — confirm it exists in this environment.
df=pd.read_csv('/mnt/data/events.csv')
  5.  
  6. def simple_lemma(token:str):
  7.     t=token.lower()
  8.     if t.endswith('ies') and len(t)>4:
  9.         return t[:-3]+'y'
  10.     if t.endswith('es') and len(t)>3:
  11.         return t[:-2]
  12.     if t.endswith('s') and len(t)>3:
  13.         return t[:-1]
  14.     return t
  15.  
  16. lexicons={
  17.     4:['epidemic','pandemic','plague','cholera','influenza','flu','smallpox','measles',
  18.        'yellow','fever','typhus','virus','disease','outbreak','covid','sars','ebola',
  19.        'coronavirus','infection'],
  20.     3:['independence','liberation','decolonization','sovereignty','nationalist','secession',
  21.        'freedom','emancipation','self-determination','unification','separate','separatist'],
  22.     2:['war','battle','rebellion','siege','invasion','conflict','raid','hostility','military',
  23.        'crusade','skirmish','ambush','massacre','attack','bombing','uprising','revolution',
  24.        'expedition','occupation','insurgency','engagement','offensive','assault'],
  25.     1:['fire','earthquake','eruption','volcano','flood','hurricane','storm','tornado','tsunami',
  26.        'cyclone','typhoon','mudslide','avalanche','drought','explosion','disaster','catastrophe',
  27.        'shipwreck','sinking','spill','crash','collapse','accident','blaze','landslide','wildfire',
  28.        'heatwave','slide']
  29. }
  30. lex_set={k:set(v) for k,v in lexicons.items()}
  31.  
  32. severity_words={
  33.     5:['catastrophic','devastating','deadliest','massive','worst','largest','extreme','mega','strongest','powerful','violent','annihilating'],
  34.     4:['severe','major','intense','huge','large','significant','heavy','destructive'],
  35.     3:['moderate','notable','considerable'],
  36.     2:['minor','small','light','weak','limited'],
  37. }
  38. severity_patterns={k:re.compile('|'.join(map(re.escape,v)),re.IGNORECASE) for k,v in severity_words.items()}
  39.  
  40. def estimate_scale(text):
  41.     t=text.lower()
  42.     nums_units=re.findall(r'(\d+(?:[\d,\.]*))\s*(million|millions|thousand|thousands|hundred|hundreds)?\s*(?:dead|killed|deaths|casualties|injured|people)?',t)
  43.     max_val=0
  44.     for num_str,unit in nums_units:
  45.         cleaned=re.sub(r'[^0-9\.]','',num_str)
  46.         cleaned=cleaned.rstrip('.')  # remove trailing dots
  47.         if cleaned=='' or cleaned=='.':
  48.             continue
  49.         num_clean=float(cleaned)
  50.         if unit:
  51.             if 'million' in unit:
  52.                 num_clean*=1_000_000
  53.             elif 'thousand' in unit:
  54.                 num_clean*=1_000
  55.             elif 'hundred' in unit:
  56.                 num_clean*=100
  57.         max_val=max(max_val,num_clean)
  58.     scale_num=None
  59.     if max_val:
  60.         if max_val>=100000: scale_num=5
  61.         elif max_val>=10000: scale_num=4
  62.         elif max_val>=1000: scale_num=3
  63.         elif max_val>=100: scale_num=2
  64.         else: scale_num=1
  65.     scale_adj=None
  66.     for s in [5,4,3,2]:
  67.         if severity_patterns[s].search(t):
  68.             scale_adj=s
  69.             break
  70.     if scale_num is not None or scale_adj is not None:
  71.         return max(scale_num or 0, scale_adj or 0)
  72.     return None
  73.  
  74. token_pattern=re.compile(r"[A-Za-z]+")
  75.  
  76. def classify(text):
  77.     tokens=[simple_lemma(tok) for tok in token_pattern.findall(str(text))]
  78.     token_set=set(tokens)
  79.     for cls in [4,3,2,1]:
  80.         if token_set & lex_set[cls]:
  81.             return cls
  82.     return 0
  83.  
  84. df['class']=df['text'].apply(classify)
  85. df['scale']=df['text'].apply(estimate_scale)
  86. out_path='/mnt/data/events_classified_tokens.csv'
  87. df[['date','text','class','scale']].to_csv(out_path,index=False)
  88.  
  89. tools.display_dataframe_to_user("Token-based classification preview", df[['date','text','class','scale']].head(20))
  90. print(f"File saved to: {out_path}")
  91.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement