As Robert Love mentions in his book, a timer provides a mechanism for triggering an interrupt at a periodic rate. Such timer interrupts orchestrate OS scheduling, update resource bookkeeping, update the wall clock, etc. We are going to take a look into the Linux timer event handler.
https://github.com/torvalds/linux/blob/master/kernel/time/clockevents.c
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
*/
void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
/* Initialize state to DETACHED */
clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
}
if (dev->cpumask == cpu_all_mask) {
WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n",
dev->name);
dev->cpumask = cpu_possible_mask;
}
raw_spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
tick_check_new_device(dev);
clockevents_notify_released();
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
tick_check_new_device()
is defined in https://github.com/torvalds/linux/blob/master/kernel/time/tick-common.c
/*
* Check, if the new registered device should be used. Called with
* clockevents_lock held and interrupts disabled.
*/
void tick_check_new_device(struct clock_event_device *newdev)
{
struct clock_event_device *curdev;
struct tick_device *td;
int cpu;
cpu = smp_processor_id();
td = &per_cpu(tick_cpu_device, cpu);
curdev = td->evtdev;
/* cpu local device ? */
if (!tick_check_percpu(curdev, newdev, cpu))
goto out_bc;
/* Preference decision */
if (!tick_check_preferred(curdev, newdev))
goto out_bc;
if (!try_module_get(newdev->owner))
return;
/*
* Replace the eventually existing device by the new
* device. If the current device is the broadcast device, do
* not give it back to the clockevents layer !
*/
if (tick_is_broadcast_device(curdev)) {
clockevents_shutdown(curdev);
curdev = NULL;
}
clockevents_exchange_device(curdev, newdev);
tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
tick_oneshot_notify();
return;
out_bc:
/*
* Can the new device be used as a broadcast device ?
*/
tick_install_broadcast_device(newdev);
}
tick_check_new_device()
calls tick_setup_device()
to setup per cpu tick interrupt handler.
newdev
is a clock_event_device
struct which describes a timer event device.
This chapter has more details on it.
(You may also want to look at other chapters of this book. It explains kernel notions very well.).
https://0xax.gitbooks.io/linux-insides/content/Timers/linux-timers-5.html
If the timer device is a broadcast device, the execution flow goes to out_bc
, which calls tick_install_broadcast_device()
.
A broadcast timer device does not belong to a certain CPU.
A non-broadcast timer will hibernate along with its CPU, and therefore, the CPU package needs an outside timer to wake it up.
A broadcast timer will do it.
tick_setup_device()
is defined as follow,
static void tick_setup_device(struct tick_device *td,
struct clock_event_device *newdev, int cpu,
const struct cpumask *cpumask)
{
void (*handler)(struct clock_event_device *) = NULL;
ktime_t next_event = 0;
/*
* First device setup ?
*/
if (!td->evtdev) {
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
tick_do_timer_cpu = cpu;
tick_next_period = ktime_get();
tick_period = NSEC_PER_SEC / HZ;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
* tick_do_timer_boot_cpu so the first housekeeping
* secondary that comes up will take do_timer from
* us.
*/
if (tick_nohz_full_cpu(cpu))
tick_do_timer_boot_cpu = cpu;
} else if (tick_do_timer_boot_cpu != -1 &&
!tick_nohz_full_cpu(cpu)) {
tick_take_do_timer_from_boot();
tick_do_timer_boot_cpu = -1;
WARN_ON(tick_do_timer_cpu != cpu);
#endif
}
/*
* Startup in periodic mode first.
*/
td->mode = TICKDEV_MODE_PERIODIC;
} else {
handler = td->evtdev->event_handler;
next_event = td->evtdev->next_event;
td->evtdev->event_handler = clockevents_handle_noop;
}
td->evtdev = newdev;
/*
* When the device is not per cpu, pin the interrupt to the
* current cpu:
*/
if (!cpumask_equal(newdev->cpumask, cpumask))
irq_set_affinity(newdev->irq, cpumask);
/*
* When global broadcasting is active, check if the current
* device is registered as a placeholder for broadcast mode.
* This allows us to handle this x86 misfeature in a generic
* way. This function also returns !=0 when we keep the
* current active broadcast state for this CPU.
*/
if (tick_device_uses_broadcast(newdev, cpu))
return;
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(newdev, 0);
else
tick_setup_oneshot(newdev, handler, next_event);
}
It does a lot of checks, and eventually calls tick_setup_periodic()
:
void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
{
tick_set_periodic_handler(dev, broadcast);
/* Broadcast setup ? */
if (!tick_device_is_functional(dev))
return;
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) {
clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
} else {
unsigned int seq;
ktime_t next;
do {
seq = read_seqbegin(&jiffies_lock);
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
return;
next = ktime_add(next, tick_period);
}
}
}
tick_set_periodic_handler()
simply writes the interrupt handler to dev->event_handler
.
The function is defined in https://github.com/torvalds/linux/blob/master/kernel/time/tick-broadcast.c
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
if (!broadcast)
dev->event_handler = tick_handle_periodic;
else
dev->event_handler = tick_handle_periodic_broadcast;
}
tick_handle_periodic()
is defined in https://github.com/torvalds/linux/blob/master/kernel/time/tick-common.c
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
tick_periodic(cpu);
#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
/*
* The cpu might have transitioned to HIGHRES or NOHZ mode via
* update_process_times() -> run_local_timers() ->
* hrtimer_run_queues().
*/
if (dev->event_handler != tick_handle_periodic)
return;
#endif
if (!clockevent_state_oneshot(dev))
return;
for (;;) {
/*
* Setup the next period for devices, which do not have
* periodic mode:
*/
next = ktime_add(next, tick_period);
if (!clockevents_program_event(dev, next, false))
return;
/*
* Have to be careful here. If we're in oneshot mode,
* before we call tick_periodic() in a loop, we need
* to be sure we're using a real hardware clocksource.
* Otherwise we could get trapped in an infinite
* loop, as the tick_periodic() increments jiffies,
* which then will increment time, possibly causing
* the loop to trigger again and again.
*/
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
}
}
Upon every timer tick interrupt, tick_handle_periodic()
calls tick_periodic()
.
If the device is a fixed timer, it will simply return.
Otherwise, if the timer is a "oneshot" device, it sets up the next timer interval (i.e. when the next tick interrupt should arrive).
In the same file, tick_periodic
is defined as,
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
write_seqlock(&jiffies_lock);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
write_sequnlock(&jiffies_lock);
update_wall_time();
}
update_process_times(user_mode(get_irq_regs()));
profile_tick(CPU_PROFILING);
}
do_timer
updates the jiffies counter and kernel load statistics.
It is defined in https://github.com/torvalds/linux/blob/master/kernel/time/timekeeping.c
void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
calc_global_load(ticks);
}
update_process_times()
is defined in https://github.com/torvalds/linux/blob/master/kernel/time/timer.c
It eats a tick from the current process for fair scheduling purposes.
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
*/
void update_process_times(int user_tick)
{
struct task_struct *p = current;
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
#endif
scheduler_tick();
if (IS_ENABLED(CONFIG_POSIX_TIMERS))
run_posix_cpu_timers();
}