Skip to content

First Data Analysis

Learn how to load and analyze the JSONL files collected with the OSECHI detector.


Prerequisites

  • Python 3.11+
  • Required libraries: pandas, matplotlib
  • A JSONL file from First Measurement — e.g. osechi_data.jsonl

Install required libraries

uv pip install pandas matplotlib

Marimo Notebook

If you prefer working in a reactive notebook:

uv pip install marimo
uv run marimo edit


1. Load a JSONL file

from pathlib import Path
import pandas as pd

file_path = Path("osechi_data.jsonl")

# lines=True makes pandas parse the file as JSON Lines: one JSON object
# (one event) per line, each becoming one row of the DataFrame.
df = pd.read_json(file_path, lines=True)

# Quick look at what was loaded: dimensions, field names, first few events.
columns = df.columns.tolist()
preview = df.head()
print(f"Shape: {df.shape}")          # (event count, field count)
print(f"\nColumns:\n{columns}")
print(f"\nFirst 5 rows:\n{preview}")

Example output:

Shape: (1000, 20)

Columns:
['type', 'status', 'received_us', 'sent_us', 'hit1', 'hit2', 'hit3', 'adc', ...]

First 5 rows:
  type status  hit1  hit2  hit3   adc  ...
0  event     ok     0     1     0  1064  ...
1  event     ok     2     2     0  1041  ...

Using the standard library

import json
from pathlib import Path

file_path = Path("osechi_data.jsonl")

# JSONL holds one JSON object per line. JSON text is UTF-8, so decode it
# explicitly rather than relying on the platform default encoding, and skip
# blank lines (e.g. a stray empty line at the end of the file), which
# json.loads() would otherwise reject with JSONDecodeError.
with file_path.open(encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"Events loaded: {len(data)}")
# An empty file has no first event to show; avoid IndexError on data[0].
if data:
    print(f"First event: {data[0]}")

2. Parse timestamps

Timestamp fields (received_us, detected_us, sent_us) are stored as Unix time in microseconds. Convert them to datetime for time-series analysis; the resulting values are timezone-naive and expressed in UTC:

import pandas as pd

df = pd.read_json("osechi_data.jsonl", lines=True)

# The *_us fields hold Unix time in microseconds; unit="us" tells pandas how
# to interpret the integers when building the matching *_dt datetime columns.
for name in ("received", "detected"):
    df[f"{name}_dt"] = pd.to_datetime(df[f"{name}_us"], unit="us")

print(df[["received_dt", "detected_dt"]].head())

Timestamp fields:

Field Description
detected_us Time recorded by the onboard RTC when the event was detected (µs)
sent_us Time the event was sent from the detector (µs)
received_us Time kazunoko received the response (µs)

3. Basic statistics

import pandas as pd

df = pd.read_json("osechi_data.jsonl", lines=True)

# describe() summarizes every numeric column (count, mean, std, min,
# quartiles, max) in a single table.
summary = df.describe()
print(summary)

Per-channel hit counts:

# One summary line per detector channel (columns hit1, hit2, hit3).
for ch in range(1, 4):
    hits = df[f"hit{ch}"]
    print(f"Channel {ch}: mean={hits.mean():.2f}, max={hits.max()}")

4. Filter events

Filter by hit pattern

# Boolean mask: True for every event in which channel 1 registered a hit
ch1_fired = df["hit1"] > 0
ch1_events = df.loc[ch1_fired]
print(f"Channel 1 events: {len(ch1_events)}")

# Coincidence: keep only the events where channel 2 fired as well
both_fired = ch1_fired & (df["hit2"] > 0)
coincidence = df.loc[both_fired]
print(f"Coincidence events: {len(coincidence)}")

Filter by ADC value

# Events with ADC between 1100 and 1200 (both bounds inclusive, which is the
# default for Series.between)
filtered = df[df["adc"].between(1100, 1200)]

# Guard the percentage against an empty DataFrame: len(df) == 0 would
# otherwise raise ZeroDivisionError.
percent = len(filtered) / len(df) * 100 if len(df) else 0.0
print(f"Filtered events: {len(filtered)} ({percent:.1f}%)")

5. Visualize the data

ADC value distribution

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_json("osechi_data.jsonl", lines=True)

# Histogram of the ADC column, drawn through the explicit Figure/Axes objects
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["adc"], bins=50, edgecolor="black", alpha=0.7)
ax.set_xlabel("ADC Value")
ax.set_ylabel("Count")
ax.set_title("ADC Value Distribution")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

Per-channel hit counts

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# One panel per channel, paired left to right with the three axes
for ax, ch in zip(axes, [1, 2, 3]):
    ax.hist(df[f"hit{ch}"], bins=20, edgecolor="black", alpha=0.7)
    ax.set_xlabel("Hit Count")
    ax.set_ylabel("Events")
    ax.set_title(f"Channel {ch}")
    ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

Event rate over time

# Index the events by their receive time so they can be binned by time
df["received_dt"] = pd.to_datetime(df["received_us"], unit="us")
df = df.set_index("received_dt")

# count() tallies the non-null adc entries falling in each 1-second bin
rate = df["adc"].resample("1s").count()

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(rate.index, rate.values)
ax.set_xlabel("Time")
ax.set_ylabel("Events / second")
ax.set_title("Event Rate")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

6. Load multiple files

from pathlib import Path
import pandas as pd

data_dir = Path("data/")
# glob() yields paths in arbitrary order; sort for a reproducible load order.
files = sorted(data_dir.glob("*.jsonl"))

# pd.concat() raises an opaque "No objects to concatenate" ValueError for an
# empty list, so fail early with a message that names the directory instead.
if not files:
    raise ValueError(f"No .jsonl files found in {data_dir}")

# Load each file into its own DataFrame, then stack them into one table with
# a fresh 0..N-1 index.
dataframes = [pd.read_json(f, lines=True) for f in files]
df = pd.concat(dataframes, ignore_index=True)

print(f"Total events: {len(df)} from {len(files)} files")

Large files

For very large files, use a generator to avoid loading everything into memory at once.

import json

def read_jsonl(path):
    """Lazily yield one decoded JSON value per non-blank line of a JSONL file.

    Only the current line is held in memory, so arbitrarily large files can
    be processed event by event.

    Args:
        path: Path of the JSONL file to read (anything ``open()`` accepts).

    Yields:
        The object decoded from each line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines carry no event and would make
            # json.loads() raise, so skip them.
            if not line.strip():
                continue
            yield json.loads(line)

# Iterating the generator pulls events from the file one by one, so only the
# current event is held in memory.
for event in read_jsonl("osechi_data.jsonl"):
    # process one event at a time
    pass

Next Steps

Check event rates over time

check_event_rate.py plots event rates from a directory of JSONL files and detects rate anomalies.

Check coincidence between two detectors

check_coincidence.py finds temporally overlapping events from two detectors using pd.merge_asof.

Convert to Parquet

from_jsonl_to_parquet.py converts JSONL files to the columnar Parquet format for faster queries on large datasets.