diff mbox series

[libfortran] Fix thead sanitizer issue with libgfortran

Message ID 43bc4547-abe8-312d-e66e-ad9e2d3034bd@netcologne.de
State New
Headers show
Series [libfortran] Fix thead sanitizer issue with libgfortran | expand

Commit Message

Thomas Koenig Sept. 28, 2017, 5:29 p.m. UTC
Hello world,

the attached patch fixes the problem reported in PR 66756:  When
opeing a file, the main lock for all units was acquired, the unit lock
was acquired, and then the main lock was released and re-aquired.  To
the thread sanitizer, this is a lock-order inversion.

One option would have been to simply close the bug, because this
only occurs in opening a file, when the gfc_unit has not yet had
a chance to escape to another thread. However, it appears that
this causes trouble debugging parallel applications, hence this
patch.

What this patch does is to change the assumptions for insert_unit:
Previously, this used to lock the newly created unit, and the caller
had to unlock. Now, gfc_get_unit can do the locking after releasing
the global lock.

This gets rid of the thread sanitizer issue; the thread sanitizer
output is clean.  However, I would appreciate feedback about whether
this approach (and my code) is correct.

Regression-tested.

Comments? Suggestions for improvements/other approaches? Close the PR
as WONTFIX instead? OK for trunk?

Regards

	Thomas

2017-09-28  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/66756
         * io/fbuf.c (fbuf_destroy): Lock unit before freeing the buffer.
         * io/unit.c (insert_unit): Do not create lock and lock, move to
         (gfc_get_unit): here; lock after insert_unit has succeded.
         (init_units): Do not unlock unit locks for stdin, stdout and
         stderr.

Comments

Thomas Koenig Sept. 29, 2017, 6:03 a.m. UTC | #1
I wrote:

> This gets rid of the thread sanitizer issue; the thread sanitizer
> output is clean.  However, I would appreciate feedback about whether
> this approach (and my code) is correct.
> 
> Regression-tested.
> 


Here is an update: The previous version of the patch had a regression,
which is removed (with a test case) with the current version.
Also regression-tested.

So,

 > Comments? Suggestions for improvements/other approaches? Close the PR
 > as WONTFIX instead? OK for trunk?

Regards

	Thomas

2017-09-29  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/66756
         * io/fbuf.c (fbuf_destroy): Add argument "locked". Lock unit
         before freeing the buffer if necessary.
         * io/unit.c (insert_unit): Do not create lock and lock, move to
         (gfc_get_unit): here; lock after insert_unit has succeded.
         (init_units): Do not unlock unit locks for stdin, stdout and
         stderr.

2017-09-29  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/66756
         * gfortran.dg/openmp-close.f90: New test.
Index: io/fbuf.c
===================================================================
--- io/fbuf.c	(Revision 253162)
+++ io/fbuf.c	(Arbeitskopie)
@@ -46,13 +46,17 @@ fbuf_init (gfc_unit *u, int len)
 
 
 void
-fbuf_destroy (gfc_unit *u)
+fbuf_destroy (gfc_unit *u, int locked)
 {
   if (u->fbuf == NULL)
     return;
+  if (locked)
+    __gthread_mutex_lock (&u->lock);
   free (u->fbuf->buf);
   free (u->fbuf);
   u->fbuf = NULL;
+  if (locked)
+    __gthread_mutex_unlock (&u->lock);
 }
 
 
Index: io/fbuf.h
===================================================================
--- io/fbuf.h	(Revision 253162)
+++ io/fbuf.h	(Arbeitskopie)
@@ -47,7 +47,7 @@ struct fbuf
 extern void fbuf_init (gfc_unit *, int);
 internal_proto(fbuf_init);
 
-extern void fbuf_destroy (gfc_unit *);
+extern void fbuf_destroy (gfc_unit *, int);
 internal_proto(fbuf_destroy);
 
 extern int fbuf_reset (gfc_unit *);
Index: io/unit.c
===================================================================
--- io/unit.c	(Revision 253162)
+++ io/unit.c	(Arbeitskopie)
@@ -221,23 +221,14 @@ insert (gfc_unit *new, gfc_unit *t)
   return t;
 }
 
+/* insert_unit()-- Create a new node, insert it into the treap.  It is assumed
+   that the caller holds unit_lock.  */
 
-/* insert_unit()-- Create a new node, insert it into the treap.  */
-
 static gfc_unit *
 insert_unit (int n)
 {
   gfc_unit *u = xcalloc (1, sizeof (gfc_unit));
   u->unit_number = n;
-#ifdef __GTHREAD_MUTEX_INIT
-  {
-    __gthread_mutex_t tmp = __GTHREAD_MUTEX_INIT;
-    u->lock = tmp;
-  }
-#else
-  __GTHREAD_MUTEX_INIT_FUNCTION (&u->lock);
-#endif
-  __gthread_mutex_lock (&u->lock);
   u->priority = pseudo_random ();
   unit_root = insert (u, unit_root);
   return u;
@@ -361,9 +352,20 @@ retry:
 
   if (created)
     {
-      /* Newly created units have their lock held already
-	 from insert_unit.  Just unlock UNIT_LOCK and return.  */
+#ifdef __GTHREAD_MUTEX_INIT
+      {
+	__gthread_mutex_t tmp = __GTHREAD_MUTEX_INIT;
+	p->lock = tmp;
+      }
+#else
+      __GTHREAD_MUTEX_INIT_FUNCTION (&p->lock);
+#endif
       __gthread_mutex_unlock (&unit_lock);
+
+      /* Nobody outside this address has seen this unit yet.  We could safely
+	 keep it unlocked until now.  */
+      
+      __gthread_mutex_lock (&p->lock);
       return p;
     }
 
@@ -618,8 +620,6 @@ init_units (void)
       u->filename = strdup (stdin_name);
 
       fbuf_init (u, 0);
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   if (options.stdout_unit >= 0)
@@ -649,8 +649,6 @@ init_units (void)
       u->filename = strdup (stdout_name);
 
       fbuf_init (u, 0);
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   if (options.stderr_unit >= 0)
@@ -680,8 +678,6 @@ init_units (void)
 
       fbuf_init (u, 256);  /* 256 bytes should be enough, probably not doing
                               any kind of exotic formatting to stderr.  */
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   /* Calculate the maximum file offset in a portable manner.
@@ -719,7 +715,7 @@ close_unit_1 (gfc_unit *u, int locked)
   u->filename = NULL;
 
   free_format_hash_table (u);
-  fbuf_destroy (u);
+  fbuf_destroy (u, locked);
 
   if (u->unit_number <= NEWUNIT_START)
     newunit_free (u->unit_number);
! { dg-do run }
! { dg-require-effective-target fopenmp }
! { dg-additional-options "-fopenmp" }
program main
  use omp_lib
  !$OMP PARALLEL NUM_THREADS(100)
  write (10,*) 'asdf'
  !$OMP END PARALLEL 
  close(10,status="delete")
end program main
Janne Blomqvist Sept. 29, 2017, 8:03 a.m. UTC | #2
On Fri, Sep 29, 2017 at 9:03 AM, Thomas Koenig <tkoenig@netcologne.de> wrote:
> I wrote:
>
>> This gets rid of the thread sanitizer issue; the thread sanitizer
>> output is clean.  However, I would appreciate feedback about whether
>> this approach (and my code) is correct.
>>
>> Regression-tested.
>>
>
>
> Here is an update: The previous version of the patch had a regression,
> which is removed (with a test case) with the current version.
> Also regression-tested.
>
> So,
>
>> Comments? Suggestions for improvements/other approaches? Close the PR
>> as WONTFIX instead? OK for trunk?

In general, I do think this is an issue that should be fixed.
Multithreading is only going get more pervasive, and gfortran spewing
false positives from tools such as threadsanitizer is definitely
unhelpful.

Now, for the patch itself, AFAICT the general approach is correct.
However, I have a couple of questions and/or suggestions for
improvements:

1) I'm confused why fbuf_destroy is modified. The fbuf structure is
part of gfc_unit, and should be accessed with the same locking rules
as the rest of the gfc_unit components. When closing the unit, I think
the same should apply here, no?

2) I think the mutex init stuff can remain in insert_unit, just the
locking needs to move out and below freeing unit_lock like you have
done.
Thomas Koenig Sept. 29, 2017, 6:53 p.m. UTC | #3
Am 29.09.2017 um 10:03 schrieb Janne Blomqvist:

> 
> 1) I'm confused why fbuf_destroy is modified. The fbuf structure is
> part of gfc_unit, and should be accessed with the same locking rules
> as the rest of the gfc_unit components. When closing the unit, I think
> the same should apply here, no?

It is to avoid a data race for

program main
   use omp_lib
   !$OMP PARALLEL NUM_THREADS(2)
   call file_open(OMP_get_thread_num())
   !$OMP END PARALLEL
contains
   recursive subroutine file_open(i)
   integer :: i
   integer :: nunit
   nunit = i + 20
   write (nunit,*) 'asdf'
   end subroutine file_open
end program main

which leads to

   Read of size 4 at 0x7b580000ff30 by main thread (mutexes: write M16):
     #0 close_unit_1 ../../../trunk/libgfortran/io/unit.c:699 
(libgfortran.so.5+0x000000283ba6)
     #1 _gfortrani_close_units ../../../trunk/libgfortran/io/unit.c:767 
(libgfortran.so.5+0x000000283f59)
     #2 cleanup ../../../trunk/libgfortran/runtime/main.c:113 
(libgfortran.so.5+0x00000001fe22)
     #3 _dl_fini <null> (ld-linux-x86-64.so.2+0x00000000fb62)

   Previous write of size 4 at 0x7b580000ff30 by thread T1 (mutexes: 
write M17):
     #0 finalize_transfer ../../../trunk/libgfortran/io/transfer.c:3934 
(libgfortran.so.5+0x000000281aa1)
     #1 _gfortran_st_write_done 
../../../trunk/libgfortran/io/transfer.c:4125 
(libgfortran.so.5+0x000000281e35)
     #2 file_open.3528 <null> (a.out+0x000000400c1b)
     #3 MAIN__._omp_fn.0 <null> (a.out+0x000000400d27)
     #4 gomp_thread_start ../../../trunk/libgomp/team.c:120 
(libgomp.so.1+0x00000001756f)

Note that not all problems with implicit close at the end are resolved
so far, but that is a different problem.

> 2) I think the mutex init stuff can remain in insert_unit, just the
> locking needs to move out and below freeing unit_lock like you have
> done.

I can change that one easily.

Any other comments?

Regards

	Thomas
Janne Blomqvist Oct. 1, 2017, 7:42 a.m. UTC | #4
On Fri, Sep 29, 2017 at 9:53 PM, Thomas Koenig <tkoenig@netcologne.de> wrote:
> Am 29.09.2017 um 10:03 schrieb Janne Blomqvist:
>
>>
>> 1) I'm confused why fbuf_destroy is modified. The fbuf structure is
>> part of gfc_unit, and should be accessed with the same locking rules
>> as the rest of the gfc_unit components. When closing the unit, I think
>> the same should apply here, no?
>
>
> It is to avoid a data race for
>
> program main
>   use omp_lib
>   !$OMP PARALLEL NUM_THREADS(2)
>   call file_open(OMP_get_thread_num())
>   !$OMP END PARALLEL
> contains
>   recursive subroutine file_open(i)
>   integer :: i
>   integer :: nunit
>   nunit = i + 20
>   write (nunit,*) 'asdf'
>   end subroutine file_open
> end program main
>
> which leads to
>
>   Read of size 4 at 0x7b580000ff30 by main thread (mutexes: write M16):
>     #0 close_unit_1 ../../../trunk/libgfortran/io/unit.c:699
> (libgfortran.so.5+0x000000283ba6)
>     #1 _gfortrani_close_units ../../../trunk/libgfortran/io/unit.c:767
> (libgfortran.so.5+0x000000283f59)
>     #2 cleanup ../../../trunk/libgfortran/runtime/main.c:113
> (libgfortran.so.5+0x00000001fe22)
>     #3 _dl_fini <null> (ld-linux-x86-64.so.2+0x00000000fb62)
>
>   Previous write of size 4 at 0x7b580000ff30 by thread T1 (mutexes: write
> M17):
>     #0 finalize_transfer ../../../trunk/libgfortran/io/transfer.c:3934
> (libgfortran.so.5+0x000000281aa1)
>     #1 _gfortran_st_write_done ../../../trunk/libgfortran/io/transfer.c:4125
> (libgfortran.so.5+0x000000281e35)
>     #2 file_open.3528 <null> (a.out+0x000000400c1b)
>     #3 MAIN__._omp_fn.0 <null> (a.out+0x000000400d27)
>     #4 gomp_thread_start ../../../trunk/libgomp/team.c:120
> (libgomp.so.1+0x00000001756f)
>
> Note that not all problems with implicit close at the end are resolved
> so far, but that is a different problem.

I'm still confused. If gfc_unit->fbuf needs to be protected by holding
gfc_unit->lock when closing, surely the same applies to other
gfc_units fields as well? How if gfc_unit->fbuf special? AFAICS it
isn't..
diff mbox series

Patch

Index: io/fbuf.c
===================================================================
--- io/fbuf.c	(Revision 253162)
+++ io/fbuf.c	(Arbeitskopie)
@@ -50,9 +50,11 @@  fbuf_destroy (gfc_unit *u)
 {
   if (u->fbuf == NULL)
     return;
+  __gthread_mutex_lock (&u->lock);
   free (u->fbuf->buf);
   free (u->fbuf);
   u->fbuf = NULL;
+  __gthread_mutex_unlock (&u->lock);
 }
 
 
Index: io/unit.c
===================================================================
--- io/unit.c	(Revision 253162)
+++ io/unit.c	(Arbeitskopie)
@@ -221,23 +221,14 @@  insert (gfc_unit *new, gfc_unit *t)
   return t;
 }
 
+/* insert_unit()-- Create a new node, insert it into the treap.  It is assumed
+   that the caller holds unit_lock.  */
 
-/* insert_unit()-- Create a new node, insert it into the treap.  */
-
 static gfc_unit *
 insert_unit (int n)
 {
   gfc_unit *u = xcalloc (1, sizeof (gfc_unit));
   u->unit_number = n;
-#ifdef __GTHREAD_MUTEX_INIT
-  {
-    __gthread_mutex_t tmp = __GTHREAD_MUTEX_INIT;
-    u->lock = tmp;
-  }
-#else
-  __GTHREAD_MUTEX_INIT_FUNCTION (&u->lock);
-#endif
-  __gthread_mutex_lock (&u->lock);
   u->priority = pseudo_random ();
   unit_root = insert (u, unit_root);
   return u;
@@ -361,9 +352,20 @@  retry:
 
   if (created)
     {
-      /* Newly created units have their lock held already
-	 from insert_unit.  Just unlock UNIT_LOCK and return.  */
+#ifdef __GTHREAD_MUTEX_INIT
+      {
+	__gthread_mutex_t tmp = __GTHREAD_MUTEX_INIT;
+	p->lock = tmp;
+      }
+#else
+      __GTHREAD_MUTEX_INIT_FUNCTION (&p->lock);
+#endif
       __gthread_mutex_unlock (&unit_lock);
+
+      /* Nobody outside this address has seen this unit yet.  We could safely
+	 keep it unlocked until now.  */
+      
+      __gthread_mutex_lock (&p->lock);
       return p;
     }
 
@@ -618,8 +620,6 @@  init_units (void)
       u->filename = strdup (stdin_name);
 
       fbuf_init (u, 0);
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   if (options.stdout_unit >= 0)
@@ -649,8 +649,6 @@  init_units (void)
       u->filename = strdup (stdout_name);
 
       fbuf_init (u, 0);
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   if (options.stderr_unit >= 0)
@@ -680,8 +678,6 @@  init_units (void)
 
       fbuf_init (u, 256);  /* 256 bytes should be enough, probably not doing
                               any kind of exotic formatting to stderr.  */
-
-      __gthread_mutex_unlock (&u->lock);
     }
 
   /* Calculate the maximum file offset in a portable manner.