Terry Zhuo committed on
Commit
881554c
·
1 Parent(s): 7168226
Files changed (1) hide show
  1. azure_count_ip_data.py +140 -24
azure_count_ip_data.py CHANGED
@@ -262,27 +262,77 @@ def get_first_user_prompt(content: str) -> Optional[str]:
262
  first_message = messages[0]
263
  if isinstance(first_message, list) and len(first_message) > 1:
264
  # Format: ["<|im_start|>user", "hello"]
265
- return first_message[1]
 
 
 
 
 
 
 
 
 
 
266
 
267
  # Format 2: state.messages array with {"role": "user", "content": "hello"} format
268
  if messages and len(messages) > 0:
269
  first_message = messages[0]
270
  if isinstance(first_message, dict) and 'content' in first_message:
271
- return first_message.get('content')
 
 
 
 
 
 
 
 
 
 
272
 
273
  # Format 3: Direct messages array in the root
274
  messages = first_line_data.get('messages', [])
275
  if messages and len(messages) > 0:
276
  first_message = messages[0]
277
  if isinstance(first_message, list) and len(first_message) > 1:
278
- return first_message[1]
 
 
 
 
 
 
 
 
 
 
279
  elif isinstance(first_message, dict) and 'content' in first_message:
280
- return first_message.get('content')
 
 
 
 
 
 
 
 
 
 
281
 
282
  # Format 4: Look for a specific user role key
283
  for key in ['user', 'human', 'Human']:
284
  if key in first_line_data:
285
- return first_line_data[key]
 
 
 
 
 
 
 
 
 
 
286
 
287
  log.warning(f"Could not extract first user prompt from content: {content[:200]}...")
288
  return None
@@ -318,25 +368,33 @@ def get_unique_prompts_per_annotator(reader: 'RemoteLogReader', start_date_str:
318
  # Process each conversation
319
  for conv_id, messages in battle_anony_logs.items():
320
  if messages:
321
- # Convert messages to file content format
322
- content = '\n'.join(json.dumps(msg) for msg in messages)
323
-
324
- # First check if the conversation passes the vote conditions
325
- ip, username, vote_conditions_met = get_file_data(content)
326
-
327
- # Only proceed if vote conditions are met
328
- if vote_conditions_met:
329
- # Get annotator name from either IP or username
330
- annotator_name = get_annotator_name(ip, username)
331
- if annotator_name:
332
- # Extract first user prompt
333
- first_prompt = get_first_user_prompt(content)
334
- if first_prompt:
335
- # Strip whitespace and check if it's not in the example prompts list
336
- cleaned_prompt = first_prompt.strip()
337
- if cleaned_prompt and cleaned_prompt not in EXAMPLE_PROMPTS:
338
- # Add to set of unique prompts for this annotator
339
- annotator_prompts[annotator_name].add(cleaned_prompt.lower())
 
 
 
 
 
 
 
 
340
 
341
  except Exception as e:
342
  log.error(f"Error processing logs for date {date_str}: {e}")
@@ -510,6 +568,57 @@ def export_unique_prompts_to_csv(reader: 'RemoteLogReader', output_file: str, st
510
 
511
  print(f"Exported {len(rows)} unique prompts to {output_file}")
512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  def main():
514
  # Initialize RemoteLogReader
515
  reader = RemoteLogReader()
@@ -520,8 +629,15 @@ def main():
520
  parser.add_argument('--download', action='store_true', help='Enable file download')
521
  parser.add_argument('--export-prompts', action='store_true', help='Export unique prompts to CSV')
522
  parser.add_argument('--output-file', default='unique_prompts.csv', help='Output file for unique prompts (default: unique_prompts.csv)')
 
523
  args = parser.parse_args()
524
 
 
 
 
 
 
 
525
  # Download files if enabled
526
  if args.download:
527
  print("\nDownloading files and organizing by annotator name...")
 
262
  first_message = messages[0]
263
  if isinstance(first_message, list) and len(first_message) > 1:
264
  # Format: ["<|im_start|>user", "hello"]
265
+ message_content = first_message[1]
266
+ # Ensure message_content is a string
267
+ if isinstance(message_content, str):
268
+ return message_content
269
+ elif isinstance(message_content, list):
270
+ # If it's a list, try to join it or get the first element
271
+ if message_content and all(isinstance(item, str) for item in message_content):
272
+ return ' '.join(message_content)
273
+ elif message_content and isinstance(message_content[0], str):
274
+ return message_content[0]
275
+ return str(message_content) if message_content else None
276
 
277
  # Format 2: state.messages array with {"role": "user", "content": "hello"} format
278
  if messages and len(messages) > 0:
279
  first_message = messages[0]
280
  if isinstance(first_message, dict) and 'content' in first_message:
281
+ message_content = first_message.get('content')
282
+ # Ensure message_content is a string
283
+ if isinstance(message_content, str):
284
+ return message_content
285
+ elif isinstance(message_content, list):
286
+ # If it's a list, try to join it or get the first element
287
+ if message_content and all(isinstance(item, str) for item in message_content):
288
+ return ' '.join(message_content)
289
+ elif message_content and isinstance(message_content[0], str):
290
+ return message_content[0]
291
+ return str(message_content) if message_content else None
292
 
293
  # Format 3: Direct messages array in the root
294
  messages = first_line_data.get('messages', [])
295
  if messages and len(messages) > 0:
296
  first_message = messages[0]
297
  if isinstance(first_message, list) and len(first_message) > 1:
298
+ message_content = first_message[1]
299
+ # Ensure message_content is a string
300
+ if isinstance(message_content, str):
301
+ return message_content
302
+ elif isinstance(message_content, list):
303
+ # If it's a list, try to join it or get the first element
304
+ if message_content and all(isinstance(item, str) for item in message_content):
305
+ return ' '.join(message_content)
306
+ elif message_content and isinstance(message_content[0], str):
307
+ return message_content[0]
308
+ return str(message_content) if message_content else None
309
  elif isinstance(first_message, dict) and 'content' in first_message:
310
+ message_content = first_message.get('content')
311
+ # Ensure message_content is a string
312
+ if isinstance(message_content, str):
313
+ return message_content
314
+ elif isinstance(message_content, list):
315
+ # If it's a list, try to join it or get the first element
316
+ if message_content and all(isinstance(item, str) for item in message_content):
317
+ return ' '.join(message_content)
318
+ elif message_content and isinstance(message_content[0], str):
319
+ return message_content[0]
320
+ return str(message_content) if message_content else None
321
 
322
  # Format 4: Look for a specific user role key
323
  for key in ['user', 'human', 'Human']:
324
  if key in first_line_data:
325
+ message_content = first_line_data[key]
326
+ # Ensure message_content is a string
327
+ if isinstance(message_content, str):
328
+ return message_content
329
+ elif isinstance(message_content, list):
330
+ # If it's a list, try to join it or get the first element
331
+ if message_content and all(isinstance(item, str) for item in message_content):
332
+ return ' '.join(message_content)
333
+ elif message_content and isinstance(message_content[0], str):
334
+ return message_content[0]
335
+ return str(message_content) if message_content else None
336
 
337
  log.warning(f"Could not extract first user prompt from content: {content[:200]}...")
338
  return None
 
368
  # Process each conversation
369
  for conv_id, messages in battle_anony_logs.items():
370
  if messages:
371
+ try:
372
+ # Convert messages to file content format
373
+ content = '\n'.join(json.dumps(msg) for msg in messages)
374
+
375
+ # First check if the conversation passes the vote conditions
376
+ ip, username, vote_conditions_met = get_file_data(content)
377
+
378
+ # Only proceed if vote conditions are met
379
+ if vote_conditions_met:
380
+ # Get annotator name from either IP or username
381
+ annotator_name = get_annotator_name(ip, username)
382
+ if annotator_name:
383
+ # Extract first user prompt
384
+ try:
385
+ first_prompt = get_first_user_prompt(content)
386
+ if first_prompt:
387
+ # Strip whitespace and check if it's not in the example prompts list
388
+ cleaned_prompt = first_prompt.strip()
389
+ if cleaned_prompt and cleaned_prompt not in EXAMPLE_PROMPTS:
390
+ # Add to set of unique prompts for this annotator
391
+ annotator_prompts[annotator_name].add(cleaned_prompt.lower())
392
+ else:
393
+ log.warning(f"Could not extract first user prompt for conversation {conv_id}")
394
+ except Exception as e:
395
+ log.error(f"Error processing first prompt for conversation {conv_id}: {e}")
396
+ except Exception as e:
397
+ log.error(f"Error processing conversation {conv_id}: {e}")
398
 
399
  except Exception as e:
400
  log.error(f"Error processing logs for date {date_str}: {e}")
 
568
 
569
  print(f"Exported {len(rows)} unique prompts to {output_file}")
570
 
571
def _print_first_log_line(content: str) -> None:
    """Print the first 200 characters of the first line of *content*."""
    first_line = content.split('\n', 1)[0]
    print(f"First line: {first_line[:200]}...")


def debug_problematic_conversations(reader: 'RemoteLogReader', date_str: str) -> None:
    """Debug function to identify problematic conversations for a specific date.

    Fetches the conversation logs for ``date_str`` from ``reader``, looks at
    the ``'battle_anony'`` entry, and prints one report line per conversation
    that passes the vote conditions and resolves to an annotator name:

    * the first 50 characters of the first user prompt when one is found;
    * a ``WARNING`` plus the start of the first serialized message when no
      prompt could be extracted;
    * an ``ERROR`` plus the start of the first serialized message when
      prompt extraction raised.

    Conversations with no messages, with unmet vote conditions, or without
    an annotator name are skipped silently.

    Args:
        reader: RemoteLogReader instance
        date_str: The date in YYYY_MM_DD format

    Returns:
        None. All output goes to stdout; exceptions are reported, not raised.
    """
    try:
        # Get conversation logs for battle_anony mode
        conv_logs = reader.get_conv_logs(date_str)
        battle_anony_logs = conv_logs.get('battle_anony', {})

        print(f"Found {len(battle_anony_logs)} conversations for date {date_str}")

        # Process each conversation
        for conv_id, messages in battle_anony_logs.items():
            if not messages:
                continue

            # Broad catch is deliberate: this is a diagnostic tool, so one bad
            # conversation must be reported without aborting the whole scan.
            try:
                # Convert messages to file content format (one JSON document per line)
                content = '\n'.join(json.dumps(msg) for msg in messages)

                # Check if the conversation passes the vote conditions
                ip, username, vote_conditions_met = get_file_data(content)
                if not vote_conditions_met:
                    continue

                # Get annotator name from either IP or username
                annotator_name = get_annotator_name(ip, username)
                if not annotator_name:
                    continue

                # Try to extract first user prompt
                try:
                    first_prompt = get_first_user_prompt(content)
                    if first_prompt:
                        print(f"Conversation {conv_id} - Annotator: {annotator_name} - First prompt: {first_prompt[:50]}...")
                    else:
                        print(f"WARNING: Could not extract first user prompt for conversation {conv_id} - Annotator: {annotator_name}")
                        # Show the start of the raw record to help diagnose the format
                        _print_first_log_line(content)
                except Exception as e:
                    print(f"ERROR: Error processing first prompt for conversation {conv_id}: {e}")
                    # Show the start of the raw record to help diagnose the format
                    _print_first_log_line(content)
            except Exception as e:
                print(f"ERROR: Error processing conversation {conv_id}: {e}")

    except Exception as e:
        print(f"ERROR: Error processing logs for date {date_str}: {e}")
621
+
622
  def main():
623
  # Initialize RemoteLogReader
624
  reader = RemoteLogReader()
 
629
  parser.add_argument('--download', action='store_true', help='Enable file download')
630
  parser.add_argument('--export-prompts', action='store_true', help='Export unique prompts to CSV')
631
  parser.add_argument('--output-file', default='unique_prompts.csv', help='Output file for unique prompts (default: unique_prompts.csv)')
632
+ parser.add_argument('--debug-date', help='Debug problematic conversations for a specific date (format: YYYY_MM_DD)')
633
  args = parser.parse_args()
634
 
635
+ # Debug problematic conversations if date is provided
636
+ if args.debug_date:
637
+ print(f"\nDebugging problematic conversations for date {args.debug_date}...")
638
+ debug_problematic_conversations(reader, args.debug_date)
639
+ return
640
+
641
  # Download files if enabled
642
  if args.download:
643
  print("\nDownloading files and organizing by annotator name...")