Terry Zhuo committed
Commit: 5a98598
Parent(s): fb6851a
Message: update
Files changed:
- app.py (+109, -0)
- count_ip_data.py (+289, -0)
- requirements.txt (+3, -0)
app.py
ADDED
@@ -0,0 +1,109 @@
import os
import gradio as gr
import pandas as pd
from datetime import datetime
import time
from count_ip_data import count_files_per_ip
import threading

# Define the path for storing the data
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
STATS_FILE = os.path.join(DATA_DIR, 'battle_stats.csv')
LAST_UPDATE_FILE = os.path.join(DATA_DIR, 'last_update.txt')

# Ensure data directory exists
os.makedirs(DATA_DIR, exist_ok=True)

def save_stats(df, current_time):
    """Save statistics and last update time to files"""
    df.to_csv(STATS_FILE, index=False)
    with open(LAST_UPDATE_FILE, 'w') as f:
        f.write(current_time)

def load_stats():
    """Load statistics and last update time from files"""
    try:
        df = pd.read_csv(STATS_FILE)
        with open(LAST_UPDATE_FILE, 'r') as f:
            last_update = f.read().strip()
        return df, last_update
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""

def update_stats():
    """Get the latest battle statistics"""
    smb_url = os.getenv("SMB_URL")
    if not smb_url:
        return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""

    ip_counts = count_files_per_ip(smb_url)

    # Convert to DataFrame for better display
    df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
    df = df.sort_values('Battle Count', ascending=False)

    # Get current time
    current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Save the updated stats
    save_stats(df, current_time)

    return df, current_time

def auto_update(state):
    """Background task to update stats every hour"""
    while state['running']:
        state['stats'], state['last_update'] = update_stats()
        time.sleep(3600)  # Sleep for 1 hour

def create_ui():
    state = {'running': True}

    # Try to load existing stats first
    state['stats'], state['last_update'] = load_stats()

    # If no existing stats or they're empty, update them
    if state['stats'].empty:
        state['stats'], state['last_update'] = update_stats()

    # Start background update thread
    update_thread = threading.Thread(target=auto_update, args=(state,))
    update_thread.daemon = True
    update_thread.start()

    def get_current_stats():
        return state['stats']

    def get_last_update():
        return state['last_update']

    def manual_refresh():
        state['stats'], state['last_update'] = update_stats()
        return state['stats'], state['last_update']

    with gr.Blocks(title="Battle Count Statistics") as app:
        gr.Markdown("# Battle Count Statistics")
        gr.Markdown("Displays the count of valid battles per IP address. Updates automatically every hour.")

        with gr.Row():
            last_update = gr.Textbox(
                value=get_last_update,
                label="Last Updated",
                interactive=False
            )

        with gr.Row():
            output = gr.DataFrame(
                value=get_current_stats,
                interactive=False,
                wrap=True,
            )

        # refresh_btn = gr.Button("Refresh Now")
        # refresh_btn.click(fn=manual_refresh, outputs=[output, last_update])

    return app

if __name__ == "__main__":
    app = create_ui()
    app.launch()
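A note on the pattern used in create_ui(): the Gradio components receive callables (get_current_stats, get_last_update) as their value=, which Gradio calls again when a page loads, while the daemon thread started in create_ui() keeps overwriting the shared state dict in the background. Below is a self-contained sketch of that shared-state pattern; fake_update() is a hypothetical stand-in for the real update_stats(), and the one-second interval is only for illustration.

# Sketch only: a daemon thread refreshes a shared dict, readers pull the latest value on demand.
import threading
import time
from datetime import datetime

state = {'running': True, 'stats': None, 'last_update': ""}

def fake_update():
    # Hypothetical stand-in for update_stats(): returns (counts, timestamp).
    return {"199.111.212.5": 1}, datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def auto_update(state, interval=1):
    # Same loop shape as app.py's auto_update(), just with a short interval.
    while state['running']:
        state['stats'], state['last_update'] = fake_update()
        time.sleep(interval)

t = threading.Thread(target=auto_update, args=(state,), daemon=True)
t.start()
time.sleep(1.5)
print(state['stats'], state['last_update'])
state['running'] = False

Because a single writer replaces whole values and page loads only read them, the commit gets away without an explicit lock, which is generally adequate for this kind of whole-value swap under CPython.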
count_ip_data.py
ADDED
@@ -0,0 +1,289 @@
import os
import logging
from datetime import datetime, timedelta
from urllib.parse import unquote
import json
from collections import defaultdict
import smbclient
import shutil
import re
import argparse

# List of IP addresses we care about
WHITELIST_IPS = [
    "199.111.212.5",
    "175.159.122.63",
    "109.245.193.97",
    "158.195.18.232",
    "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
    "66.254.231.49",
    "129.74.154.194",
    "175.196.44.217",
    "2601:600:8d00:9510:1d77:b610:9358:f443",
    "74.90.222.68",
    "2a02:169:3e9:0:6ce8:e76f:faed:c830",
    "70.50.179.57",
    "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
    "2408:8418:6390:7603:40b:555f:774:a05d"
]

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger(__name__)
# log.disabled = True

def get_ip_from_jsonl(file_path):
    """Extract IP from the first line of a JSONL file"""
    try:
        with smbclient.open_file(file_path, mode='r') as f:
            first_line = f.readline()
            data = json.loads(first_line)
            return data.get('ip')
    except Exception as e:
        log.error(f"Error reading file {file_path}: {e}")
        return None

def get_chat_session_id(file_path):
    """Extract chat_session_id based on the file location:
    - For files under conv_logs: extract from filename
    - For files under sandbox_logs: read from file content
    """
    try:
        if 'conv_logs' in file_path:
            # Extract from filename for conv_logs
            # Handle Windows UNC path format
            filename = file_path.split('\\')[-1]  # Get the last component of the path
            match = re.match(r'conv-log-([a-f0-9]+)\.json', filename)
            if match:
                return match.group(1)
        elif 'sandbox_logs' in file_path:
            # Read from file content for sandbox_logs
            with smbclient.open_file(file_path, mode='r') as f:
                data = json.loads(f.read())
                return data['sandbox_state'].get('chat_session_id')
        return None
    except Exception as e:
        log.error(f"Error getting chat_session_id from {file_path}: {e}")
        return None

def get_sandbox_session_ids(server, share, date_str):
    """Get all chat_session_ids from sandbox logs for a given date"""
    sandbox_folder = f"\\\\{server}\\{share}\\{date_str}\\sandbox_logs"
    session_ids = set()

    if not smbclient.path.exists(sandbox_folder):
        return session_ids

    try:
        for file_info in smbclient.scandir(sandbox_folder):
            if file_info.name.endswith('.json'):
                file_path = f"{sandbox_folder}\\{file_info.name}"
                session_id = get_chat_session_id(file_path)
                if session_id:
                    session_ids.add(session_id)
    except Exception as e:
        log.error(f"Error scanning sandbox folder {sandbox_folder}: {e}")

    return session_ids

def check_vote_conditions(file_path):
    """Check if the last line of the file has type:vote and feedback dict with 6 keys"""
    try:
        with smbclient.open_file(file_path, mode='r') as f:
            # Read all lines and get the last non-empty line
            lines = [line.strip() for line in f if line.strip()]
            if not lines:
                return False
            last_line = lines[-1]
            try:
                data = json.loads(last_line)
                feedback = data.get('feedback')
                return (data.get('type') == 'vote' and
                        isinstance(feedback, dict) and
                        len(feedback) == 6)
            except json.JSONDecodeError:
                return False
    except Exception as e:
        log.error(f"Error checking vote conditions in file {file_path}: {e}")
        return False

def get_file_data(file_path):
    """Read file and return IP and vote condition status"""
    try:
        with smbclient.open_file(file_path, mode='r') as f:
            lines = [line.strip() for line in f if line.strip()]
            if not lines:
                return None, False

            # Get IP from first line
            try:
                first_line_data = json.loads(lines[0])
                ip = first_line_data.get('ip')
                # Early return if IP is not in whitelist
                if ip not in WHITELIST_IPS:
                    return None, False
            except json.JSONDecodeError:
                ip = None

            # Check vote conditions from last line
            try:
                last_line_data = json.loads(lines[-1])
                feedback = last_line_data.get('feedback')
                vote_conditions_met = (last_line_data.get('type') == 'vote' and
                                       isinstance(feedback, dict) and
                                       len(feedback) == 6)
            except json.JSONDecodeError:
                vote_conditions_met = False

            return ip, vote_conditions_met
    except Exception as e:
        log.error(f"Error reading file {file_path}: {e}")
        return None, False

def count_files_per_ip(smb_url, start_date_str="2025_02_18"):
    """Count files per IP address from the given start date"""
    # Remove 'smb://' prefix and parse URL components
    url = smb_url[6:]
    creds_server, share = url.split('/', 1)
    creds, server = creds_server.rsplit('@', 1)
    username, password = creds.split(':', 1)
    password = unquote(password)

    # Register the SMB session
    smbclient.register_session(server, username=username, password=password)

    # Convert start date string to datetime
    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
    ip_counts = defaultdict(int)

    try:
        # Get current date for iteration
        current_date = start_date
        today = datetime.now()

        while current_date <= today:
            date_str = current_date.strftime("%Y_%m_%d")
            folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"

            try:
                # List all JSON files in the battle_anony folder
                if smbclient.path.exists(folder_path):
                    for file_info in smbclient.scandir(folder_path, search_pattern="conv-log-*.json"):
                        file_path = f"{folder_path}\\{file_info.name}"
                        ip, vote_conditions_met = get_file_data(file_path)
                        if vote_conditions_met and ip:
                            ip_counts[ip] += 1
            except Exception as e:
                log.error(f"Error processing folder {date_str}: {e}")

            # Move to next day
            current_date += timedelta(days=1)

    except Exception as e:
        log.error(f"Error accessing SMB share: {e}")

    return dict(ip_counts)

def download_files_by_ip(smb_url, start_date_str="2025_02_18", check_sandbox=True):
    """Download files and organize them by IP address

    Args:
        smb_url (str): The SMB URL to connect to
        start_date_str (str): The start date in YYYY_MM_DD format
        check_sandbox (bool): Whether to check for matching sandbox logs
    """
    # Remove 'smb://' prefix and parse URL components
    url = smb_url[6:]
    creds_server, share = url.split('/', 1)
    creds, server = creds_server.rsplit('@', 1)
    username, password = creds.split(':', 1)
    password = unquote(password)

    # Register the SMB session
    smbclient.register_session(server, username=username, password=password)

    # Create base data directory
    data_dir = os.path.join(os.getcwd(), "data")
    os.makedirs(data_dir, exist_ok=True)

    # Convert start date string to datetime
    start_date = datetime.strptime(start_date_str, "%Y_%m_%d")

    try:
        # Get current date for iteration
        current_date = start_date
        today = datetime.now()

        while current_date <= today:
            date_str = current_date.strftime("%Y_%m_%d")
            folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"

            # Get all sandbox session IDs for this date
            sandbox_session_ids = get_sandbox_session_ids(server, share, date_str) if check_sandbox else set()
            try:
                # List all JSON files in the battle_anony folder
                if smbclient.path.exists(folder_path):
                    for file_info in smbclient.scandir(folder_path):
                        # Skip macOS metadata files
                        if file_info.name.startswith('._'):
                            continue
                        if file_info.name.endswith('.json'):
                            file_path = f"{folder_path}\\{file_info.name}"
                            ip = get_ip_from_jsonl(file_path)
                            if ip:
                                # Create directory structure for this IP
                                ip_dir = os.path.join(data_dir, ip)
                                valid_dir = os.path.join(ip_dir, "valid")
                                invalid_dir = os.path.join(ip_dir, "invalid")
                                os.makedirs(valid_dir, exist_ok=True)
                                os.makedirs(invalid_dir, exist_ok=True)

                                # Check if chat_session_id exists in sandbox logs
                                if check_sandbox:
                                    chat_session_id = get_chat_session_id(file_path)
                                    has_sandbox = chat_session_id in sandbox_session_ids if chat_session_id else False
                                    target_dir = valid_dir if has_sandbox else invalid_dir
                                else:
                                    # When sandbox checking is disabled, put everything in valid
                                    target_dir = valid_dir

                                # Download the file
                                local_file_path = os.path.join(target_dir, file_info.name)
                                try:
                                    with smbclient.open_file(file_path, mode='rb') as remote_file:
                                        with open(local_file_path, 'wb') as local_file:
                                            shutil.copyfileobj(remote_file, local_file)
                                    log.info(f"Downloaded {file_info.name} to {target_dir}")
                                except Exception as e:
                                    log.error(f"Error downloading file {file_info.name}: {e}")

            except Exception as e:
                log.error(f"Error processing folder {date_str}: {e}")

            # Move to next day
            current_date += timedelta(days=1)

    except Exception as e:
        log.error(f"Error accessing SMB share: {e}")

def main():
    smb_url = os.getenv("SMB_URL")

    # Add argument parser for optional parameters
    parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
    parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
    parser.add_argument('--download', action='store_true', help='Enable file download')
    args = parser.parse_args()

    # Download files if enabled
    if args.download:
        print("\nDownloading files and organizing by IP address...")
        download_files_by_ip(smb_url, check_sandbox=args.sandbox_check)

    # Count and display statistics
    ip_counts = count_files_per_ip(smb_url)
    print("\nFile counts per IP address:")
    for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"IP: {ip:<15} Count: {count}")

if __name__ == "__main__":
    main()
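For reference, count_files_per_ip() and download_files_by_ip() both expect SMB_URL in the form smb://username:password@server/share, with the password URL-encoded. A minimal sketch of the parsing steps they perform before smbclient.register_session(); the URL below is made up for illustration:

# Sketch only: mirrors the URL parsing at the top of count_files_per_ip().
from urllib.parse import unquote

smb_url = "smb://alice:p%40ssw0rd@fileserver/logs"   # hypothetical example value
url = smb_url[6:]                                     # strip the 'smb://' prefix
creds_server, share = url.split('/', 1)               # 'alice:p%40ssw0rd@fileserver', 'logs'
creds, server = creds_server.rsplit('@', 1)           # 'alice:p%40ssw0rd', 'fileserver'
username, password = creds.split(':', 1)              # 'alice', 'p%40ssw0rd'
password = unquote(password)                          # 'p@ssw0rd'
print(server, share, username, password)

Using rsplit('@', 1) keeps any literal '@' inside the password on the credentials side, and unquote() restores percent-encoded characters before the session is registered.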
requirements.txt
ADDED
@@ -0,0 +1,3 @@
gradio>=4.0.0
pandas>=2.0.0
smbclient>=1.0.0
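Taken together, both entry points are driven by the SMB_URL environment variable: app.py serves the hourly-refreshing Gradio dashboard, and count_ip_data.py prints per-IP counts (with optional --download and --sandbox-check flags). A minimal sketch of calling the counting routine directly from Python, assuming the dependencies above are installed and SMB_URL is already exported:

# Sketch only: equivalent to running `python count_ip_data.py` without flags.
import os
from count_ip_data import count_files_per_ip

counts = count_files_per_ip(os.environ["SMB_URL"], start_date_str="2025_02_18")
for ip, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{ip:<15} {n}")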