crt: Make tls_atexit.c use explicit Tls function calls instead of automatic thread local storage

The automatic thread local storage worked just fine when using
native TLS (as clang does by default), but fails when using emutls
(which is the default in GCC).

The TLS callback function for DLL_THREAD_DETACH is called after emutls
has already cleaned up for the thread, so tls_dtors was always NULL
at that time. Work around this by using Windows native TlsAlloc and
friends.

This didn't seem like an issue originally, as GCC/libstdc++ seemed to
keep using its own function. However that was an incorrect conclusion;
when building a cross compiler, libstdc++ always provides the
__cxa_thread_atexit function, but if building GCC/libstdc++ natively,
it tries to detect the presence of __cxa_thread_atexit, and if found,
it omits providing it in libstdc++, using the one from mingw-w64-crt
instead.

Fixes https://sourceforge.net/p/mingw-w64/bugs/859/

Signed-off-by: Jeremy Drake <jeremyd2019@users.sourceforge.net>
Signed-off-by: Martin Storsjö <martin@martin.st>
diff --git a/mingw-w64-crt/crt/tls_atexit.c b/mingw-w64-crt/crt/tls_atexit.c
index 1241b56..f39731a 100644
--- a/mingw-w64-crt/crt/tls_atexit.c
+++ b/mingw-w64-crt/crt/tls_atexit.c
@@ -35,7 +35,7 @@
 static CRITICAL_SECTION lock;
 static int inited = 0;
 static dtor_obj *global_dtors = NULL;
-static __thread dtor_obj *tls_dtors = NULL;
+static DWORD tls_dtors_slot = TLS_OUT_OF_INDEXES;
 
 int __mingw_cxa_atexit(dtor_fn dtor, void *obj, void *dso) {
   if (!inited)
@@ -73,24 +73,29 @@
     return 1;
   handler->dtor = dtor;
   handler->obj = obj;
-  handler->next = tls_dtors;
-  tls_dtors = handler;
+  handler->next = (dtor_obj *)TlsGetValue(tls_dtors_slot);
+  TlsSetValue(tls_dtors_slot, handler);
   return 0;
 }
 
 static void WINAPI tls_atexit_callback(HANDLE __UNUSED_PARAM(hDllHandle), DWORD dwReason, LPVOID __UNUSED_PARAM(lpReserved)) {
   if (dwReason == DLL_PROCESS_DETACH) {
-    run_dtor_list(&tls_dtors);
+    dtor_obj * p = (dtor_obj *)TlsGetValue(tls_dtors_slot);
+    run_dtor_list(&p);
+    TlsSetValue(tls_dtors_slot, p);
+    TlsFree(tls_dtors_slot);
     run_dtor_list(&global_dtors);
   }
 }
 
 static void WINAPI tls_callback(HANDLE hDllHandle, DWORD dwReason, LPVOID __UNUSED_PARAM(lpReserved)) {
+  dtor_obj * p;
   switch (dwReason) {
   case DLL_PROCESS_ATTACH:
     if (inited == 0) {
       InitializeCriticalSection(&lock);
       __dso_handle = hDllHandle;
+      tls_dtors_slot = TlsAlloc();
       /*
        * We can only call _register_thread_local_exe_atexit_callback once
        * in a process; if we call it a second time the process terminates.
@@ -129,11 +134,14 @@
      * linked CRT (which still runs TLS destructors for the main thread).
      */
     if (__mingw_module_is_dll) {
-      run_dtor_list(&tls_dtors);
+      p = (dtor_obj *)TlsGetValue(tls_dtors_slot);
+      run_dtor_list(&p);
+      TlsSetValue(tls_dtors_slot, p);
       /* For DLLs, run dtors when detached. For EXEs, run dtors via the
        * thread local atexit callback, to make sure they don't run when
        * exiting the process with _exit or ExitProcess. */
       run_dtor_list(&global_dtors);
+      TlsFree(tls_dtors_slot);
     }
     if (inited == 1) {
       inited = 0;
@@ -143,7 +151,9 @@
   case DLL_THREAD_ATTACH:
     break;
   case DLL_THREAD_DETACH:
-    run_dtor_list(&tls_dtors);
+    p = (dtor_obj *)TlsGetValue(tls_dtors_slot);
+    run_dtor_list(&p);
+    TlsSetValue(tls_dtors_slot, p);
     break;
   }
 }