From cc86039ed915a04021bf908f127821b1c01b8792 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Sat, 14 Jan 2023 09:42:09 +0800 Subject: [PATCH] [x86-64] Implement TLSGD to TLSIE relaxation If we know that the .so file we are creating will not be dlopen'ed, we can relax __tls_get_addr function calls to GOT loads. --- docs/mold.1 | 2 ++ elf/arch-i386.cc | 8 +++--- elf/arch-s390x.cc | 5 ++-- elf/arch-x86-64.cc | 51 +++++++++++++++++++++++++++------ elf/mold.h | 10 ------- test/elf/x86_64_ifunc-alias.sh | 0 test/elf/x86_64_tls-gd-to-ie.sh | 38 ++++++++++++++++++++++++ 7 files changed, 90 insertions(+), 24 deletions(-) mode change 100644 => 100755 test/elf/x86_64_ifunc-alias.sh create mode 100755 test/elf/x86_64_tls-gd-to-ie.sh diff --git a/docs/mold.1 b/docs/mold.1 index 5f833b78e6..10addaa8af 100644 --- a/docs/mold.1 +++ b/docs/mold.1 @@ -1198,6 +1198,8 @@ Mark DSO non-deletable at runtime. .It Fl z Cm nodlopen Mark DSO not available to .Xr dlopen 3 . +This option makes it possible for the linker to optimize thread-local \ +variable accesses by rewriting instructions for some targets. .Pp .It Fl z Cm nodump Mark DSO not available to diff --git a/elf/arch-i386.cc b/elf/arch-i386.cc index 41d77a1291..cfb70f1d7d 100644 --- a/elf/arch-i386.cc +++ b/elf/arch-i386.cc @@ -140,6 +140,7 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); + *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel); *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx) - ctx.got->shdr.sh_addr; } else { static const u8 insn[] = { @@ -149,10 +150,9 @@ void write_plt_entry(Context &ctx, u8 *buf, Symbol &sym) { 0xcc, // (padding) }; memcpy(buf, insn, sizeof(insn)); + *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel); *(ul32 *)(buf + 11) = sym.get_gotplt_addr(ctx); } - - *(ul32 *)(buf + 5) = sym.get_plt_idx(ctx) * sizeof(ElfRel); } template <> @@ -528,7 +528,7 @@ void InputSection::scan_relocations(Context &ctx) { ty != R_386_GOT32 && ty != R_386_GOT32X) Fatal(ctx) << *this << ": TLS_GD reloc must be followed by PLT or GOT32"; - if (relax_tlsgd(ctx, sym)) + if (ctx.arg.relax && !ctx.arg.shared && !sym.is_imported) i++; else sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed); @@ -542,7 +542,7 @@ void InputSection::scan_relocations(Context &ctx) { ty != R_386_GOT32 && ty != R_386_GOT32X) Fatal(ctx) << *this << ": TLS_LDM reloc must be followed by PLT or GOT32"; - if (relax_tlsld(ctx)) + if (ctx.arg.relax && !ctx.arg.shared) i++; else ctx.needs_tlsld.store(true, std::memory_order_relaxed); diff --git a/elf/arch-s390x.cc b/elf/arch-s390x.cc index 3798d03ebe..84564305c1 100644 --- a/elf/arch-s390x.cc +++ b/elf/arch-s390x.cc @@ -456,12 +456,13 @@ void InputSection::scan_relocations(Context &ctx) { break; case R_390_TLS_GD32: case R_390_TLS_GD64: - if (!relax_tlsgd(ctx, sym)) + if (bool do_relax = ctx.arg.relax && !ctx.arg.shared && !sym.is_imported; + !do_relax) sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed); break; case R_390_TLS_LDM32: case R_390_TLS_LDM64: - if (!relax_tlsld(ctx)) + if (bool do_relax = ctx.arg.relax && !ctx.arg.shared; !do_relax) ctx.needs_tlsld.store(true, std::memory_order_relaxed); break; case R_390_TLS_LE32: diff --git a/elf/arch-x86-64.cc b/elf/arch-x86-64.cc index db1fd7f128..dcd3a42626 100644 --- a/elf/arch-x86-64.cc +++ b/elf/arch-x86-64.cc @@ -234,8 +234,37 @@ static void relax_gd_to_le(u8 *loc, ElfRel rel, u64 val) { } } +static void relax_gd_to_ie(u8 *loc, ElfRel rel, u64 val) { + switch (rel.r_type) { + case R_X86_64_PLT32: + case R_X86_64_PC32: + case R_X86_64_GOTPCREL: + case R_X86_64_GOTPCRELX: { + static const u8 insn[] = { + 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax + 0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax + }; + memcpy(loc - 4, insn, sizeof(insn)); + *(ul32 *)(loc + 8) = val - 12; + break; + } + case R_X86_64_PLTOFF64: { + static const u8 insn[] = { + 0x64, 0x48, 0x8b, 0x04, 0x25, 0, 0, 0, 0, // mov %fs:0, %rax + 0x48, 0x03, 0x05, 0, 0, 0, 0, // add foo@gottpoff(%rip), %rax + 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00, // nop + }; + memcpy(loc - 3, insn, sizeof(insn)); + *(ul32 *)(loc + 9) = val - 13; + break; + } + default: + unreachable(); + } +} + // Rewrite a function call to __tls_get_addr to a cheaper instruction -// sequence. The difference from relax_ld_to_le is that we are +// sequence. The difference from relax_gd_to_le is that we are // materializing a Dynamic Thread Pointer for the current ELF module // instead of an address for a particular thread-local variable. static void relax_ld_to_le(u8 *loc, ElfRel rel, u64 val) { @@ -416,6 +445,9 @@ void InputSection::apply_reloc_alloc(Context &ctx, u8 *base) { case R_X86_64_TLSGD: if (sym.has_tlsgd(ctx)) { write32s(sym.get_tlsgd_addr(ctx) + A - P); + } else if (sym.has_gottp(ctx)) { + relax_gd_to_ie(loc, rels[i + 1], sym.get_gottp_addr(ctx) - P); + i++; } else { relax_gd_to_le(loc, rels[i + 1], S - ctx.tp_addr); i++; @@ -662,7 +694,7 @@ void InputSection::scan_relocations(Context &ctx) { if (sym.is_imported) sym.flags.fetch_or(NEEDS_PLT, std::memory_order_relaxed); break; - case R_X86_64_TLSGD: { + case R_X86_64_TLSGD: if (rel.r_addend != -4) Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSGD"; @@ -675,13 +707,17 @@ void InputSection::scan_relocations(Context &ctx) { ty != R_X86_64_GOTPCRELX) Fatal(ctx) << *this << ": TLSGD reloc must be followed by PLT or GOTPCREL"; - if (relax_tlsgd(ctx, sym)) + if (ctx.arg.relax && !sym.is_imported && !ctx.arg.shared) { i++; - else + } else if (ctx.arg.relax && !sym.is_imported && ctx.arg.shared && + !ctx.arg.z_dlopen) { + sym.flags.fetch_or(NEEDS_GOTTP, std::memory_order_relaxed); + i++; + } else { sym.flags.fetch_or(NEEDS_TLSGD, std::memory_order_relaxed); + } break; - } - case R_X86_64_TLSLD: { + case R_X86_64_TLSLD: if (rel.r_addend != -4) Fatal(ctx) << *this << ": bad r_addend for R_X86_64_TLSLD"; @@ -694,12 +730,11 @@ void InputSection::scan_relocations(Context &ctx) { ty != R_X86_64_GOTPCRELX) Fatal(ctx) << *this << ": TLSLD reloc must be followed by PLT or GOTPCREL"; - if (relax_tlsld(ctx)) + if (ctx.arg.relax && !ctx.arg.shared) i++; else ctx.needs_tlsld.store(true, std::memory_order_relaxed); break; - } case R_X86_64_GOTTPOFF: { if (rel.r_addend != -4) Fatal(ctx) << *this << ": bad r_addend for R_X86_64_GOTTPOFF"; diff --git a/elf/mold.h b/elf/mold.h index 3b11c0e84c..d9ef3264df 100644 --- a/elf/mold.h +++ b/elf/mold.h @@ -2826,16 +2826,6 @@ inline bool is_c_identifier(std::string_view s) { return true; } -template -inline bool relax_tlsgd(Context &ctx, Symbol &sym) { - return ctx.arg.relax && !ctx.arg.shared && !sym.is_imported; -} - -template -inline bool relax_tlsld(Context &ctx) { - return ctx.arg.relax && !ctx.arg.shared; -} - template inline bool relax_tlsdesc(Context &ctx, Symbol &sym) { // TLSDESC relocs must be always relaxed for statically-linked diff --git a/test/elf/x86_64_ifunc-alias.sh b/test/elf/x86_64_ifunc-alias.sh old mode 100644 new mode 100755 diff --git a/test/elf/x86_64_tls-gd-to-ie.sh b/test/elf/x86_64_tls-gd-to-ie.sh new file mode 100755 index 0000000000..257d52bf7d --- /dev/null +++ b/test/elf/x86_64_tls-gd-to-ie.sh @@ -0,0 +1,38 @@ +#!/bin/bash +. $(dirname $0)/common.inc + +cat < + +__attribute__((tls_model("global-dynamic"))) static _Thread_local int x1 = 1; +__attribute__((tls_model("global-dynamic"))) _Thread_local int x2 = 2; +__attribute__((tls_model("global-dynamic"))) _Thread_local int x3; + +int foo() { + x3 = 3; + + printf("%d %d %d\n", x1, x2, x3); + return 0; +} +EOF + +cat <