Terry Zhuo committed
Commit 5a98598 · 1 Parent(s): fb6851a
Files changed (3)
  1. app.py +109 -0
  2. count_ip_data.py +289 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import gradio as gr
+ import pandas as pd
+ from datetime import datetime
+ import time
+ from count_ip_data import count_files_per_ip
+ import threading
+
+ # Define the path for storing the data
+ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
+ STATS_FILE = os.path.join(DATA_DIR, 'battle_stats.csv')
+ LAST_UPDATE_FILE = os.path.join(DATA_DIR, 'last_update.txt')
+
+ # Ensure data directory exists
+ os.makedirs(DATA_DIR, exist_ok=True)
+
+ def save_stats(df, current_time):
+     """Save statistics and last update time to files"""
+     df.to_csv(STATS_FILE, index=False)
+     with open(LAST_UPDATE_FILE, 'w') as f:
+         f.write(current_time)
+
+ def load_stats():
+     """Load statistics and last update time from files"""
+     try:
+         df = pd.read_csv(STATS_FILE)
+         with open(LAST_UPDATE_FILE, 'r') as f:
+             last_update = f.read().strip()
+         return df, last_update
+     except (FileNotFoundError, pd.errors.EmptyDataError):
+         return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""
+
+ def update_stats():
+     """Get the latest battle statistics"""
+     smb_url = os.getenv("SMB_URL")
+     if not smb_url:
+         return pd.DataFrame(columns=['IP Address', 'Battle Count']), ""
+
+     ip_counts = count_files_per_ip(smb_url)
+
+     # Convert to DataFrame for better display
+     df = pd.DataFrame(list(ip_counts.items()), columns=['IP Address', 'Battle Count'])
+     df = df.sort_values('Battle Count', ascending=False)
+
+     # Get current time
+     current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+     # Save the updated stats
+     save_stats(df, current_time)
+
+     return df, current_time
+
+ def auto_update(state):
+     """Background task to update stats every hour"""
+     while state['running']:
+         state['stats'], state['last_update'] = update_stats()
+         time.sleep(3600)  # Sleep for 1 hour
+
+ def create_ui():
+     state = {'running': True}
+
+     # Try to load existing stats first
+     state['stats'], state['last_update'] = load_stats()
+
+     # If no existing stats or they're empty, update them
+     if state['stats'].empty:
+         state['stats'], state['last_update'] = update_stats()
+
+     # Start background update thread
+     update_thread = threading.Thread(target=auto_update, args=(state,))
+     update_thread.daemon = True
+     update_thread.start()
+
+     def get_current_stats():
+         return state['stats']
+
+     def get_last_update():
+         return state['last_update']
+
+     def manual_refresh():
+         state['stats'], state['last_update'] = update_stats()
+         return state['stats'], state['last_update']
+
+     with gr.Blocks(title="Battle Count Statistics") as app:
+         gr.Markdown("# Battle Count Statistics")
+         gr.Markdown("Displays the count of valid battles per IP address. Updates automatically every hour.")
+
+         with gr.Row():
+             last_update = gr.Textbox(
+                 value=get_last_update,
+                 label="Last Updated",
+                 interactive=False
+             )
+
+         with gr.Row():
+             output = gr.DataFrame(
+                 value=get_current_stats,
+                 interactive=False,
+                 wrap=True,
+             )
+
+         # refresh_btn = gr.Button("Refresh Now")
+         # refresh_btn.click(fn=manual_refresh, outputs=[output, last_update])
+
+     return app
+
+ if __name__ == "__main__":
+     app = create_ui()
+     app.launch()
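
For reference, a minimal local-run sketch. It is hedged: the SMB_URL value below is hypothetical, and its smb://user:url-encoded-password@server/share shape is inferred from the URL parsing in count_files_per_ip rather than documented anywhere in this commit.

    import os

    # Hypothetical credentials/share; count_files_per_ip expects
    # smb://<user>:<url-encoded-password>@<server>/<share>.
    os.environ["SMB_URL"] = "smb://alice:s3cr%40t@fileserver/arena-logs"

    from app import create_ui

    create_ui().launch()  # starts the Gradio dashboard and the hourly update thread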
count_ip_data.py ADDED
@@ -0,0 +1,289 @@
+ import os
+ import logging
+ from datetime import datetime, timedelta
+ from urllib.parse import unquote
+ import json
+ from collections import defaultdict
+ import smbclient
+ import shutil
+ import re
+ import argparse
+
+ # List of IP addresses we care about
+ WHITELIST_IPS = [
+     "199.111.212.5",
+     "175.159.122.63",
+     "109.245.193.97",
+     "158.195.18.232",
+     "2607:fea8:4f40:4b00:e5b9:9806:6b69:233b",
+     "66.254.231.49",
+     "129.74.154.194",
+     "175.196.44.217",
+     "2601:600:8d00:9510:1d77:b610:9358:f443",
+     "74.90.222.68",
+     "2a02:169:3e9:0:6ce8:e76f:faed:c830",
+     "70.50.179.57",
+     "2a02:842a:24:5a01:8cd6:5b22:1189:6035",
+     "2408:8418:6390:7603:40b:555f:774:a05d"
+ ]
+
+ logging.basicConfig(level=logging.WARNING)
+ log = logging.getLogger(__name__)
+ # log.disabled = True
+
+ def get_ip_from_jsonl(file_path):
+     """Extract IP from the first line of a JSONL file"""
+     try:
+         with smbclient.open_file(file_path, mode='r') as f:
+             first_line = f.readline()
+             data = json.loads(first_line)
+             return data.get('ip')
+     except Exception as e:
+         log.error(f"Error reading file {file_path}: {e}")
+         return None
+
+ def get_chat_session_id(file_path):
+     """Extract chat_session_id based on the file location:
+     - For files under conv_logs: extract from filename
+     - For files under sandbox_logs: read from file content
+     """
+     try:
+         if 'conv_logs' in file_path:
+             # Extract from filename for conv_logs
+             # Handle Windows UNC path format
+             filename = file_path.split('\\')[-1]  # Get the last component of the path
+             match = re.match(r'conv-log-([a-f0-9]+)\.json', filename)
+             if match:
+                 return match.group(1)
+         elif 'sandbox_logs' in file_path:
+             # Read from file content for sandbox_logs
+             with smbclient.open_file(file_path, mode='r') as f:
+                 data = json.loads(f.read())
+                 return data['sandbox_state'].get('chat_session_id')
+         return None
+     except Exception as e:
+         log.error(f"Error getting chat_session_id from {file_path}: {e}")
+         return None
+
+ def get_sandbox_session_ids(server, share, date_str):
+     """Get all chat_session_ids from sandbox logs for a given date"""
+     sandbox_folder = f"\\\\{server}\\{share}\\{date_str}\\sandbox_logs"
+     session_ids = set()
+
+     if not smbclient.path.exists(sandbox_folder):
+         return session_ids
+
+     try:
+         for file_info in smbclient.scandir(sandbox_folder):
+             if file_info.name.endswith('.json'):
+                 file_path = f"{sandbox_folder}\\{file_info.name}"
+                 session_id = get_chat_session_id(file_path)
+                 if session_id:
+                     session_ids.add(session_id)
+     except Exception as e:
+         log.error(f"Error scanning sandbox folder {sandbox_folder}: {e}")
+
+     return session_ids
+
+ def check_vote_conditions(file_path):
+     """Check if the last line of the file has type:vote and feedback dict with 6 keys"""
+     try:
+         with smbclient.open_file(file_path, mode='r') as f:
+             # Read all lines and get the last non-empty line
+             lines = [line.strip() for line in f if line.strip()]
+             if not lines:
+                 return False
+             last_line = lines[-1]
+             try:
+                 data = json.loads(last_line)
+                 feedback = data.get('feedback')
+                 return (data.get('type') == 'vote' and
+                         isinstance(feedback, dict) and
+                         len(feedback) == 6)
+             except json.JSONDecodeError:
+                 return False
+     except Exception as e:
+         log.error(f"Error checking vote conditions in file {file_path}: {e}")
+         return False
+
+ def get_file_data(file_path):
+     """Read file and return IP and vote condition status"""
+     try:
+         with smbclient.open_file(file_path, mode='r') as f:
+             lines = [line.strip() for line in f if line.strip()]
+             if not lines:
+                 return None, False
+
+             # Get IP from first line
+             try:
+                 first_line_data = json.loads(lines[0])
+                 ip = first_line_data.get('ip')
+                 # Early return if IP is not in whitelist
+                 if ip not in WHITELIST_IPS:
+                     return None, False
+             except json.JSONDecodeError:
+                 ip = None
+
+             # Check vote conditions from last line
+             try:
+                 last_line_data = json.loads(lines[-1])
+                 feedback = last_line_data.get('feedback')
+                 vote_conditions_met = (last_line_data.get('type') == 'vote' and
+                                        isinstance(feedback, dict) and
+                                        len(feedback) == 6)
+             except json.JSONDecodeError:
+                 vote_conditions_met = False
+
+             return ip, vote_conditions_met
+     except Exception as e:
+         log.error(f"Error reading file {file_path}: {e}")
+         return None, False
+
+ def count_files_per_ip(smb_url, start_date_str="2025_02_18"):
+     """Count files per IP address from the given start date"""
+     # Remove 'smb://' prefix and parse URL components
+     url = smb_url[6:]
+     creds_server, share = url.split('/', 1)
+     creds, server = creds_server.rsplit('@', 1)
+     username, password = creds.split(':', 1)
+     password = unquote(password)
+
+     # Register the SMB session
+     smbclient.register_session(server, username=username, password=password)
+
+     # Convert start date string to datetime
+     start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+     ip_counts = defaultdict(int)
+
+     try:
+         # Get current date for iteration
+         current_date = start_date
+         today = datetime.now()
+
+         while current_date <= today:
+             date_str = current_date.strftime("%Y_%m_%d")
+             folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"
+
+             try:
+                 # List all JSON files in the battle_anony folder
+                 if smbclient.path.exists(folder_path):
+                     for file_info in smbclient.scandir(folder_path, search_pattern="conv-log-*.json"):
+                         file_path = f"{folder_path}\\{file_info.name}"
+                         ip, vote_conditions_met = get_file_data(file_path)
+                         if vote_conditions_met and ip:
+                             ip_counts[ip] += 1
+             except Exception as e:
+                 log.error(f"Error processing folder {date_str}: {e}")
+
+             # Move to next day
+             current_date += timedelta(days=1)
+
+     except Exception as e:
+         log.error(f"Error accessing SMB share: {e}")
+
+     return dict(ip_counts)
+
+ def download_files_by_ip(smb_url, start_date_str="2025_02_18", check_sandbox=True):
+     """Download files and organize them by IP address
+
+     Args:
+         smb_url (str): The SMB URL to connect to
+         start_date_str (str): The start date in YYYY_MM_DD format
+         check_sandbox (bool): Whether to check for matching sandbox logs
+     """
+     # Remove 'smb://' prefix and parse URL components
+     url = smb_url[6:]
+     creds_server, share = url.split('/', 1)
+     creds, server = creds_server.rsplit('@', 1)
+     username, password = creds.split(':', 1)
+     password = unquote(password)
+
+     # Register the SMB session
+     smbclient.register_session(server, username=username, password=password)
+
+     # Create base data directory
+     data_dir = os.path.join(os.getcwd(), "data")
+     os.makedirs(data_dir, exist_ok=True)
+
+     # Convert start date string to datetime
+     start_date = datetime.strptime(start_date_str, "%Y_%m_%d")
+
+     try:
+         # Get current date for iteration
+         current_date = start_date
+         today = datetime.now()
+
+         while current_date <= today:
+             date_str = current_date.strftime("%Y_%m_%d")
+             folder_path = f"\\\\{server}\\{share}\\{date_str}\\conv_logs\\battle_anony"
+
+             # Get all sandbox session IDs for this date
+             sandbox_session_ids = get_sandbox_session_ids(server, share, date_str) if check_sandbox else set()
+             try:
+                 # List all JSON files in the battle_anony folder
+                 if smbclient.path.exists(folder_path):
+                     for file_info in smbclient.scandir(folder_path):
+                         # Skip macOS metadata files
+                         if file_info.name.startswith('._'):
+                             continue
+                         if file_info.name.endswith('.json'):
+                             file_path = f"{folder_path}\\{file_info.name}"
+                             ip = get_ip_from_jsonl(file_path)
+                             if ip:
+                                 # Create directory structure for this IP
+                                 ip_dir = os.path.join(data_dir, ip)
+                                 valid_dir = os.path.join(ip_dir, "valid")
+                                 invalid_dir = os.path.join(ip_dir, "invalid")
+                                 os.makedirs(valid_dir, exist_ok=True)
+                                 os.makedirs(invalid_dir, exist_ok=True)
+
+                                 # Check if chat_session_id exists in sandbox logs
+                                 if check_sandbox:
+                                     chat_session_id = get_chat_session_id(file_path)
+                                     has_sandbox = chat_session_id in sandbox_session_ids if chat_session_id else False
+                                     target_dir = valid_dir if has_sandbox else invalid_dir
+                                 else:
+                                     # When sandbox checking is disabled, put everything in valid
+                                     target_dir = valid_dir
+
+                                 # Download the file
+                                 local_file_path = os.path.join(target_dir, file_info.name)
+                                 try:
+                                     with smbclient.open_file(file_path, mode='rb') as remote_file:
+                                         with open(local_file_path, 'wb') as local_file:
+                                             shutil.copyfileobj(remote_file, local_file)
+                                     log.info(f"Downloaded {file_info.name} to {target_dir}")
+                                 except Exception as e:
+                                     log.error(f"Error downloading file {file_info.name}: {e}")
+
+             except Exception as e:
+                 log.error(f"Error processing folder {date_str}: {e}")
+
+             # Move to next day
+             current_date += timedelta(days=1)
+
+     except Exception as e:
+         log.error(f"Error accessing SMB share: {e}")
+
+ def main():
+     smb_url = os.getenv("SMB_URL")
+
+     # Add argument parser for optional parameters
+     parser = argparse.ArgumentParser(description='Download and organize conversation files by IP')
+     parser.add_argument('--sandbox-check', action='store_true', help='Check for matching sandbox logs')
+     parser.add_argument('--download', action='store_true', help='Enable file download')
+     args = parser.parse_args()
+
+     # Download files if enabled
+     if args.download:
+         print("\nDownloading files and organizing by IP address...")
+         download_files_by_ip(smb_url, check_sandbox=args.sandbox_check)
+
+     # Count and display statistics
+     ip_counts = count_files_per_ip(smb_url)
+     print("\nFile counts per IP address:")
+     for ip, count in sorted(ip_counts.items(), key=lambda x: x[1], reverse=True):
+         print(f"IP: {ip:<15} Count: {count}")
+
+ if __name__ == "__main__":
+     main()
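
To make the counting rule concrete: get_file_data treats a conv-log file as one valid battle only when its first JSONL line carries a whitelisted ip and its last line is a vote record whose feedback dict has exactly six keys. A self-contained sketch of that last-line check follows (the feedback key names are illustrative; only the count of six matters):

    import json

    # Hypothetical last line of a conv-log-<id>.json file.
    last_line = json.dumps({
        "type": "vote",
        "feedback": {"q1": 5, "q2": 4, "q3": 5, "q4": 3, "q5": 4, "q6": 2},
    })

    data = json.loads(last_line)
    feedback = data.get("feedback")
    is_valid_vote = (data.get("type") == "vote"
                     and isinstance(feedback, dict)
                     and len(feedback) == 6)
    print(is_valid_vote)  # True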
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio>=4.0.0
+ pandas>=2.0.0
+ smbclient>=1.0.0
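
One packaging note, hedged: the smbclient module that count_ip_data.py relies on (register_session, open_file, scandir, path.exists) is the high-level API shipped with the smbprotocol distribution on PyPI, so if installing smbclient>=1.0.0 does not expose those functions, smbprotocol is the likely intended requirement. A quick sanity check under that assumption:

    # Verifies the three runtime dependencies import cleanly.
    import gradio
    import pandas
    import smbclient  # provided by smbprotocol if the smbclient package lacks this API

    print(gradio.__version__, pandas.__version__)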