race in timerobj

Philippe Gerum rpm at xenomai.org
Fri Dec 4 16:29:02 CET 2020


Ronny Meeus <ronny.meeus at gmail.com> writes:

> Op di 1 dec. 2020 om 14:51 schreef Philippe Gerum <rpm at xenomai.org>:
>>
>>
>> Ronny Meeus via Xenomai <xenomai at xenomai.org> writes:
>>
>> > Op di 1 dec. 2020 om 12:06 schreef Jan Kiszka <jan.kiszka at siemens.com>:
>> >>
>> >> On 01.12.20 11:26, Ronny Meeus via Xenomai wrote:
>> >> > Hello Xenomai community,
>> >> >
>> >> > it looks like we have a race condition in the timer object handling.
>> >> > The scope of the below mentioned issue is the alarm interface of the
>> >> > alchemy skin:
>> >> > int rt_alarm_start(RT_ALARM *alarm, RTIME value, RTIME interval)
>> >> >
>> >> > The documentation mentions that this start can be called also on an
>> >> > already running timer:
>> >> > "This service overrides any previous setup of the expiry date and
>> >> > reload interval for the given alarm."
>> >> >
>> >> > In the timer server code (see file lib/copperplate/timerobj.c):
>> >> > static void *timerobj_server (void *arg))
>> >> > I see the timer being re-inserted in the timeout list in case of a
>> >> > periodic timer.
>> >> >
>> >> > write_lock_nocancel(&svlock);
>> >> > ...
>> >> > if (interval.tv_sec > 0 || interval.tv_nsec > 0) {
>> >> >   timespec_add(&tmobj->itspec.it_value, &value, &interval);
>> >> >   timerobj_enqueue(tmobj);
>> >> > }
>> >> > write_unlock(&svlock);
>> >> > tmobj->handler(tmobj);
>> >> > write_lock_nocancel(&svlock);
>> >> > }
>> >> >
>> >> > This re-insert is done with the svlock taken but the timer specific
>> >> > lock is not taken.
>> >> >
>> >> > In the start on the other hand I see:
>> >> >
>> >> > int timerobj_start(struct timerobj *tmobj,
>> >> >   void (*handler)(struct timerobj *tmobj),
>> >> >   struct itimerspec *it) /* lock held, dropped */
>> >> > {
>> >> > tmobj->handler = handler;
>> >> > tmobj->itspec = *it;
>> >> > ...
>> >> > write_lock_nocancel(&svlock);
>> >> >
>> >> > So the itspec is updated with only the timerobj lock taken.
>> >> > If the timeout value is changed via the timerobj_start while the timer is
>> >> > under processing by the timer server, we can enter an endless loop (at
>> >> > least that is what we see sporadically)
>> >> >
>> >> > Does this make sense?
>> >>
>> >> Yes, at least to me. Patch welcome.
>> >
>> > Jan,
>> >
>> > this is the patch we are currently testing with (note that the line numbers
>> > might not match since we are not at the latest revision).
>> >
>> > diff --git a/lib/copperplate/timerobj.c b/lib/copperplate/timerobj.c
>> > --- a/lib/copperplate/timerobj.c
>> > +++ b/lib/copperplate/timerobj.c
>> > @@ -165,7 +165,8 @@ static void *timerobj_server(void *arg)
>> >                                 timerobj_enqueue(tmobj);
>> >                         }
>> >                         write_unlock(&svlock);
>> > -                       tmobj->handler(tmobj);
>> > +                       if (tmobj->handler)
>> > +                               tmobj->handler(tmobj);
>> >                         write_lock_nocancel(&svlock);
>> >                 }
>> >
>> > @@ -232,10 +233,14 @@ int timerobj_start(struct timerobj *tmob
>> >                    void (*handler)(struct timerobj *tmobj),
>> >                    struct itimerspec *it) /* lock held, dropped */
>> >  {
>> > +       write_lock_nocancel(&svlock);
>> > +
>> > +       if (pvholder_linked(&tmobj->next))
>> > +               pvlist_remove_init(&tmobj->next);
>> > +
>> >         tmobj->handler = handler;
>> >         tmobj->itspec = *it;
>> >
>> > -       write_lock_nocancel(&svlock);
>> >         timerobj_enqueue(tmobj);
>> >         write_unlock(&svlock);
>> >         timerobj_unlock(tmobj);
>> >
>> >
>> >>
>> >> Note, though, that updating the handler will remain inherently racy.
>> >>
>>
>> Good catch. Also, we would need a consistent view of the
>> (value,interval,handler) triplet for any time wrt concurrent start/stop
>> updates, all accessed under server lock. We would not want a new value
>> being copied to be used with an old interval and/or handler for
>> instance.
>>
>> There is also a locking imbalance to be fixed on error in
>> timerobj_start().
>>
>> e.g. (proudly untested).
>>
>> diff --git a/lib/copperplate/timerobj.c b/lib/copperplate/timerobj.c
>> index cbfcda566..08cc0d3b9 100644
>> --- a/lib/copperplate/timerobj.c
>> +++ b/lib/copperplate/timerobj.c
>> @@ -98,6 +98,7 @@ static int server_prologue(void *arg)
>>
>>  static void *timerobj_server(void *arg)
>>  {
>> +       void (*handler)(struct timerobj *tmobj);
>>         struct timespec now, value, interval;
>>         struct timerobj *tmobj, *tmp;
>>         sigset_t set;
>> @@ -120,17 +121,18 @@ static void *timerobj_server(void *arg)
>>
>>                 pvlist_for_each_entry_safe(tmobj, tmp, &svtimers, next) {
>>                         value = tmobj->itspec.it_value;
>> +                       interval = tmobj->itspec.it_interval;
>> +                       handler = tmobj->handler;
>>                         if (timespec_after(&value, &now))
>>                                 break;
>>                         pvlist_remove_init(&tmobj->next);
>> -                       interval = tmobj->itspec.it_interval;
>>                         if (interval.tv_sec > 0 || interval.tv_nsec > 0) {
>>                                 timespec_add(&tmobj->itspec.it_value,
>>                                              &value, &interval);
>>                                 timerobj_enqueue(tmobj);
>>                         }
>>                         write_unlock(&svlock);
>> -                       tmobj->handler(tmobj);
>> +                       handler(tmobj);
>>                         write_lock_nocancel(&svlock);
>>                 }
>>
>> @@ -218,8 +220,7 @@ int timerobj_start(struct timerobj *tmobj,
>>                    void (*handler)(struct timerobj *tmobj),
>>                    struct itimerspec *it) /* lock held, dropped */
>>  {
>> -       tmobj->handler = handler;
>> -       tmobj->itspec = *it;
>> +       int ret = 0;
>>
>>         /*
>>          * We hold the queue lock long enough to prevent the timer
>> @@ -232,14 +233,23 @@ int timerobj_start(struct timerobj *tmobj,
>>          */
>>         write_lock_nocancel(&svlock);
>>
>> -       if (__RT(timer_settime(tmobj->timer, TIMER_ABSTIME, it, NULL)))
>> -               return __bt(-errno);
>> +       if (pvholder_linked(&tmobj->next))
>> +               pvlist_remove_init(&tmobj->next);
>> +
>> +       tmobj->handler = handler;
>> +       tmobj->itspec = *it;
>> +
>> +       if (__RT(timer_settime(tmobj->timer, TIMER_ABSTIME, it, NULL))) {
>> +               ret = __bt(-errno);
>> +               goto fail;
>> +       }
>>
>>         timerobj_enqueue(tmobj);
>> +fail:
>>         write_unlock(&svlock);
>>         timerobj_unlock(tmobj);
>>
>> -       return 0;
>> +       return ret;
>>  }
>>
>>  int timerobj_stop(struct timerobj *tmobj) /* lock held, dropped */
>> @@ -251,10 +261,9 @@ int timerobj_stop(struct timerobj *tmobj) /* lock held, dropped */
>>         if (pvholder_linked(&tmobj->next))
>>                 pvlist_remove_init(&tmobj->next);
>>
>> -       write_unlock(&svlock);
>> -
>>         __RT(timer_settime(tmobj->timer, 0, &itimer_stop, NULL));
>>         tmobj->handler = NULL;
>> +       write_unlock(&svlock);
>>         timerobj_unlock(tmobj);
>>
>>         return 0;
>> --
>
> Philippe,
>
> we tested with the patch you proposed and the problem seem to be resolved.
> I think this is a good improvement.
>
> Best regards,
> Ronny
>
>> Philippe.

Ok, thanks for testing. Now pushing a patch referring to your original
post with the preliminary fix.

-- 
Philippe.



More information about the Xenomai mailing list