Index: contrib/gcc/doc/invoke.texi =================================================================== --- contrib/gcc/doc/invoke.texi (revision 237107) +++ contrib/gcc/doc/invoke.texi (working copy) @@ -513,7 +513,7 @@ in the following sections. -mno-fp-ret-in-387 -msoft-float -msvr3-shlib @gol -mno-wide-multiply -mrtd -malign-double @gol -mpreferred-stack-boundary=@var{num} @gol --mmmx -msse -msse2 -msse3 -mssse3 -m3dnow @gol +-mmmx -msse -msse2 -msse3 -mssse3 -msse4a -m3dnow -mpopcnt -mabm @gol -mthreads -mno-align-stringops -minline-all-stringops @gol -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mregparm=@var{num} -msseregparm @gol @@ -9061,6 +9061,10 @@ AMD K8 core based CPUs with x86-64 instruction set MMX, SSE, SSE2, 3dNOW!, enhanced 3dNOW! and 64-bit instruction set extensions.) @item k8-sse3, opteron-sse3, athlon64-sse3 Improved versions of k8, opteron and athlon64 with SSE3 instruction set support. +@item amdfam10, barcelona +AMD Family 10h core based CPUs with x86-64 instruction set support. (This +supersets MMX, SSE, SSE2, SSE3, SSE4A, 3dNOW!, enhanced 3dNOW!, ABM and 64-bit +instruction set extensions.) @item winchip-c6 IDT Winchip C6 CPU, dealt in same way as i486 with additional MMX instruction set support. @@ -9355,8 +9359,14 @@ preferred alignment to @option{-mpreferred-stack-b @itemx -mno-sse3 @item -mssse3 @itemx -mno-ssse3 +@item -msse4a +@item -mno-sse4a @item -m3dnow @itemx -mno-3dnow +@item -mpopcnt +@itemx -mno-popcnt +@item -mabm +@itemx -mno-abm @opindex mmmx @opindex mno-mmx @opindex msse @@ -9364,7 +9374,7 @@ preferred alignment to @option{-mpreferred-stack-b @opindex m3dnow @opindex mno-3dnow These switches enable or disable the use of instructions in the MMX, -SSE, SSE2, SSE3, SSSE3 or 3DNow! extended instruction sets. +SSE, SSE2, SSE3, SSSE3, SSE4A, ABM or 3DNow! extended instruction sets. These extensions are also available as built-in functions: see @ref{X86 Built-in Functions}, for details of the functions enabled and disabled by these switches. Index: contrib/gcc/doc/extend.texi =================================================================== --- contrib/gcc/doc/extend.texi (revision 237107) +++ contrib/gcc/doc/extend.texi (working copy) @@ -7255,6 +7255,23 @@ v4si __builtin_ia32_pabsd128 (v4si) v8hi __builtin_ia32_pabsw128 (v8hi) @end smallexample +The following built-in functions are available when @option{-msse4a} is used. + +@smallexample +void _mm_stream_sd (double*,__m128d); +Generates the @code{movntsd} machine instruction. +void _mm_stream_ss (float*,__m128); +Generates the @code{movntss} machine instruction. +__m128i _mm_extract_si64 (__m128i, __m128i); +Generates the @code{extrq} machine instruction with only SSE register operands. +__m128i _mm_extracti_si64 (__m128i, int, int); +Generates the @code{extrq} machine instruction with SSE register and immediate operands. +__m128i _mm_insert_si64 (__m128i, __m128i); +Generates the @code{insertq} machine instruction with only SSE register operands. +__m128i _mm_inserti_si64 (__m128i, __m128i, int, int); +Generates the @code{insertq} machine instruction with SSE register and immediate operands. +@end smallexample + The following built-in functions are available when @option{-m3dnow} is used. All of them generate the machine instruction that is part of the name. 
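The extend.texi hunk above documents the new SSE4A built-ins only as prototypes. As a hedged illustration (not part of the patch itself), the sketch below shows how the two streaming-store intrinsics are typically called; it assumes the <ammintrin.h> header added later in this patch and compilation with -msse4a, and the function and buffer names are invented for the example.

/* Hedged sketch: scalar non-temporal stores via the new SSE4A intrinsics.
   Assumes <ammintrin.h> from this patch and something like
   "gcc -O2 -msse4a stream.c"; the loops and names are illustrative only.  */
#include <ammintrin.h>

void
copy_doubles_nt (double *dst, const double *src, int n)
{
  int i;
  for (i = 0; i < n; i++)
    {
      /* Load one double into an XMM register, then store it with movntsd,
         bypassing the cache.  */
      __m128d v = _mm_load_sd (&src[i]);
      _mm_stream_sd (&dst[i], v);
    }
}

void
copy_floats_nt (float *dst, const float *src, int n)
{
  int i;
  for (i = 0; i < n; i++)
    _mm_stream_ss (&dst[i], _mm_load_ss (&src[i]));  /* movntss */
}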
Index: contrib/gcc/ChangeLog.gcc43 =================================================================== --- contrib/gcc/ChangeLog.gcc43 (revision 237107) +++ contrib/gcc/ChangeLog.gcc43 (working copy) @@ -1,3 +1,8 @@ +2007-05-01 Dwarakanath Rajagopal (r124341) + + * doc/invoke.texi: Fix typo, 'AMD Family 10h core' instead of + 'AMD Family 10 core'. + 2007-05-01 Dwarakanath Rajagopal (r124339) * config/i386/i386.c (override_options): Accept k8-sse3, opteron-sse3 @@ -5,10 +10,27 @@ with SSE3 instruction set support. * doc/invoke.texi: Likewise. +2007-05-01 Dwarakanath Rajagopal (r124330) + + * config/i386/i386.c (override_options): Tuning 32-byte loop + alignment for amdfam10 architecture. Increasing the max loop + alignment to 24 bytes. + 2007-04-07 H.J. Lu (r123639) * config/i386/i386.c (ix86_handle_option): Handle SSSE3. +2007-03-28 Dwarakanath Rajagopal (r123313) + + * config.gcc: Accept barcelona as a variant of amdfam10. + * config/i386/i386.c (override_options): Likewise. + * doc/invoke.texi: Likewise. + +2007-02-09 Dwarakanath Rajagopal (r121763) + + * config/i386/driver-i386.c: Turn on -mtune=native for AMDFAM10. + (bit_SSE4a): New. + 2007-02-08 Harsha Jagasia (r121726) * config/i386/xmmintrin.h: Make inclusion of emmintrin.h @@ -26,6 +48,168 @@ * config/i386/i386.c (override_options): Set PTA_SSSE3 for core2. +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (athlon_fldxf_k8, athlon_fld_k8, + athlon_fstxf_k8, athlon_fst_k8, athlon_fist, athlon_fmov, + athlon_fadd_load, athlon_fadd_load_k8, athlon_fadd, athlon_fmul, + athlon_fmul_load, athlon_fmul_load_k8, athlon_fsgn, + athlon_fdiv_load, athlon_fdiv_load_k8, athlon_fdiv_k8, + athlon_fpspc_load, athlon_fpspc, athlon_fcmov_load, + athlon_fcmov_load_k8, athlon_fcmov_k8, athlon_fcomi_load_k8, + athlon_fcomi, athlon_fcom_load_k8, athlon_fcom): Added amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/i386.md (x86_sahf_1, cmpfp_i_mixed, cmpfp_i_sse, + cmpfp_i_i387, cmpfp_iu_mixed, cmpfp_iu_sse, cmpfp_iu_387, + swapsi, swaphi_1, swapqi_1, swapdi_rex64, fix_truncsfdi_sse, + fix_truncdfdi_sse, fix_truncsfsi_sse, fix_truncdfsi_sse, + x86_fldcw_1, floatsisf2_mixed, floatsisf2_sse, floatdisf2_mixed, + floatdisf2_sse, floatsidf2_mixed, floatsidf2_sse, + floatdidf2_mixed, floatdidf2_sse, muldi3_1_rex64, mulsi3_1, + mulsi3_1_zext, mulhi3_1, mulqi3_1, umulqihi3_1, mulqihi3_insn, + umulditi3_insn, umulsidi3_insn, mulditi3_insn, mulsidi3_insn, + umuldi3_highpart_rex64, umulsi3_highpart_insn, + umulsi3_highpart_zext, smuldi3_highpart_rex64, + smulsi3_highpart_insn, smulsi3_highpart_zext, x86_64_shld, + x86_shld_1, x86_64_shrd, sqrtsf2_mixed, sqrtsf2_sse, + sqrtsf2_i387, sqrtdf2_mixed, sqrtdf2_sse, sqrtdf2_i387, + sqrtextendsfdf2_i387, sqrtxf2, sqrtextendsfxf2_i387, + sqrtextenddfxf2_i387): Added amdfam10_decode. + + * config/i386/athlon.md (athlon_idirect_amdfam10, + athlon_ivector_amdfam10, athlon_idirect_load_amdfam10, + athlon_ivector_load_amdfam10, athlon_idirect_both_amdfam10, + athlon_ivector_both_amdfam10, athlon_idirect_store_amdfam10, + athlon_ivector_store_amdfam10): New define_insn_reservation. + (athlon_idirect_loadmov, athlon_idirect_movstore): Added + amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (athlon_call_amdfam10, + athlon_pop_amdfam10, athlon_lea_amdfam10): New + define_insn_reservation. 
+ (athlon_branch, athlon_push, athlon_leave_k8, athlon_imul_k8, + athlon_imul_k8_DI, athlon_imul_mem_k8, athlon_imul_mem_k8_DI, + athlon_idiv, athlon_idiv_mem, athlon_str): Added amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (athlon_sseld_amdfam10, + athlon_mmxld_amdfam10, athlon_ssest_amdfam10, + athlon_mmxssest_short_amdfam10): New define_insn_reservation. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (athlon_sseins_amdfam10): New + define_insn_reservation. + * config/i386/i386.md (sseins): Added sseins to define_attr type + and define_attr unit. + * config/i386/sse.md: Set type attribute to sseins for insertq + and insertqi. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (sselog_load_amdfam10, sselog_amdfam10, + ssecmpvector_load_amdfam10, ssecmpvector_amdfam10, + ssecomi_load_amdfam10, ssecomi_amdfam10, + sseaddvector_load_amdfam10, sseaddvector_amdfam10): New + define_insn_reservation. + (ssecmp_load_k8, ssecmp, sseadd_load_k8, seadd): Added amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (cvtss2sd_load_amdfam10, + cvtss2sd_amdfam10, cvtps2pd_load_amdfam10, cvtps2pd_amdfam10, + cvtsi2sd_load_amdfam10, cvtsi2ss_load_amdfam10, + cvtsi2sd_amdfam10, cvtsi2ss_amdfam10, cvtsd2ss_load_amdfam10, + cvtsd2ss_amdfam10, cvtpd2ps_load_amdfam10, cvtpd2ps_amdfam10, + cvtsX2si_load_amdfam10, cvtsX2si_amdfam10): New + define_insn_reservation. + + * config/i386/sse.md (cvtsi2ss, cvtsi2ssq, cvtss2si, + cvtss2siq, cvttss2si, cvttss2siq, cvtsi2sd, cvtsi2sdq, + cvtsd2si, cvtsd2siq, cvttsd2si, cvttsd2siq, + cvtpd2dq, cvttpd2dq, cvtsd2ss, cvtss2sd, + cvtpd2ps, cvtps2pd): Added amdfam10_decode attribute. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/athlon.md (athlon_ssedivvector_amdfam10, + athlon_ssedivvector_load_amdfam10, athlon_ssemulvector_amdfam10, + athlon_ssemulvector_load_amdfam10): New define_insn_reservation. + (athlon_ssediv, athlon_ssediv_load_k8, athlon_ssemul, + athlon_ssemul_load_k8): Added amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/i386.h (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL): New macro. + (x86_sse_unaligned_move_optimal): New variable. + + * config/i386/i386.c (x86_sse_unaligned_move_optimal): Enable for + m_AMDFAM10. + (ix86_expand_vector_move_misalign): Add code to generate movupd/movups + for unaligned vector SSE double/single precision loads for AMDFAM10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/i386.h (TARGET_AMDFAM10): New macro. + (TARGET_CPU_CPP_BUILTINS): Add code for amdfam10. + Define TARGET_CPU_DEFAULT_amdfam10. + (TARGET_CPU_DEFAULT_NAMES): Add amdfam10. + (processor_type): Add PROCESSOR_AMDFAM10. + + * config/i386/i386.md: Add amdfam10 as a new cpu attribute to match + processor_type in config/i386/i386.h. + Enable imul peepholes for TARGET_AMDFAM10. + + * config.gcc: Add support for --with-cpu option for amdfam10. + + * config/i386/i386.c (amdfam10_cost): New variable. + (m_AMDFAM10): New macro. + (m_ATHLON_K8_AMDFAM10): New macro. + (x86_use_leave, x86_push_memory, x86_movx, x86_unroll_strlen, + x86_cmove, x86_3dnow_a, x86_deep_branch, x86_use_simode_fiop, + x86_promote_QImode, x86_integer_DFmode_moves, + x86_partial_reg_dependency, x86_memory_mismatch_stall, + x86_accumulate_outgoing_args, x86_arch_always_fancy_math_387, + x86_sse_partial_reg_dependency, x86_sse_typeless_stores, + x86_use_ffreep, x86_use_incdec, x86_four_jump_limit, + x86_schedule, x86_use_bt, x86_cmpxchg16b, x86_pad_returns): + Enable/disable for amdfam10. 
+ (override_options): Add amdfam10_cost to processor_target_table. + Set up PROCESSOR_AMDFAM10 for amdfam10 entry in + processor_alias_table. + (ix86_issue_rate): Add PROCESSOR_AMDFAM10. + (ix86_adjust_cost): Add code for amdfam10. + +2007-02-05 Harsha Jagasia (r121625) + + * config/i386/i386.opt: Add new Advanced Bit Manipulation (-mabm) + instruction set feature flag. Add new (-mpopcnt) flag for popcnt + instruction. Add new SSE4A (-msse4a) instruction set feature flag. + * config/i386/i386.h: Add builtin definition for SSE4A. + * config/i386/i386.md: Add support for ABM instructions + (popcnt and lzcnt). + * config/i386/sse.md: Add support for SSE4A instructions + (movntss, movntsd, extrq, insertq). + * config/i386/i386.c: Add support for ABM and SSE4A builtins. + Add -march=amdfam10 flag. + * config/i386/ammintrin.h: Add support for SSE4A intrinsics. + * doc/invoke.texi: Add documentation on flags for sse4a, abm, popcnt + and amdfam10. + * doc/extend.texi: Add documentation for SSE4A builtins. + +2007-01-24 Jakub Jelinek (r121140) + + * config/i386/i386.h (x86_cmpxchg16b): Remove const. + (TARGET_CMPXCHG16B): Define to x86_cmpxchg16b. + * config/i386/i386.c (x86_cmpxchg16b): Remove const. + (override_options): Add PTA_CX16 flag. Set x86_cmpxchg16b + for CPUs that have PTA_CX16 set. + 2007-01-17 Eric Christopher (r120846) * config.gcc: Support core2 processor. Index: contrib/gcc/config.gcc =================================================================== --- contrib/gcc/config.gcc (revision 237107) +++ contrib/gcc/config.gcc (working copy) @@ -269,12 +269,12 @@ xscale-*-*) i[34567]86-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" ;; x86_64-*-*) cpu_type=i386 extra_headers="mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h - pmmintrin.h tmmintrin.h" + pmmintrin.h tmmintrin.h ammintrin.h" need_64bit_hwint=yes ;; ia64-*-*) @@ -1209,14 +1209,14 @@ i[34567]86-*-solaris2*) # FIXME: -m64 for i[34567]86-*-* should be allowed just # like -m32 for x86_64-*-*. 
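The ChangeLog entries above note that i386.h gains an SSE4A builtin definition and that TARGET_CPU_CPP_BUILTINS learns about amdfam10, so user code can test for the feature at preprocessing time. A hedged sketch follows: __SSE4A__ is the macro that ammintrin.h itself checks, while __amdfam10__ is assumed here from the CPP-builtins change and should be verified against the actual i386.h hunk.

/* Hedged sketch of compile-time feature tests once this patch is applied.
   __SSE4A__ comes from -msse4a (or -march=amdfam10/barcelona); __amdfam10__
   is an assumption based on the TARGET_CPU_CPP_BUILTINS change.  */
#include <stdio.h>

int
main (void)
{
#ifdef __SSE4A__
  puts ("SSE4A built-ins available");
#endif
#ifdef __amdfam10__
  puts ("compiled for AMD Family 10h");
#endif
  return 0;
}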
case X"${with_cpu}" in - Xgeneric|Xcore2|Xnocona|Xx86-64|Xk8|Xopteron|Xathlon64|Xathlon-fx) + Xgeneric|Xcore2|Xnocona|Xx86-64|Xamdfam10|Xbarcelona|Xk8|Xopteron|Xathlon64|Xathlon-fx) ;; X) with_cpu=generic ;; *) echo "Unsupported CPU used in --with-cpu=$with_cpu, supported values:" 1>&2 - echo "generic core2 nocona x86-64 k8 opteron athlon64 athlon-fx" 1>&2 + echo "generic core2 nocona x86-64 amdfam10 barcelona k8 opteron athlon64 athlon-fx" 1>&2 exit 1 ;; esac @@ -2515,6 +2515,9 @@ if test x$with_cpu = x ; then ;; i686-*-* | i786-*-*) case ${target_noncanonical} in + amdfam10-*|barcelona-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2555,6 +2558,9 @@ if test x$with_cpu = x ; then ;; x86_64-*-*) case ${target_noncanonical} in + amdfam10-*|barcelona-*) + with_cpu=amdfam10 + ;; k8-*|opteron-*|athlon_64-*) with_cpu=k8 ;; @@ -2795,7 +2801,7 @@ case "${target}" in esac # OK ;; - "" | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) + "" | amdfam10 | barcelona | k8 | opteron | athlon64 | athlon-fx | nocona | core2 | generic) # OK ;; *) Index: contrib/gcc/config/i386/i386.md =================================================================== --- contrib/gcc/config/i386/i386.md (revision 237107) +++ contrib/gcc/config/i386/i386.md (working copy) @@ -153,6 +153,22 @@ (UNSPEC_PSHUFB 120) (UNSPEC_PSIGN 121) (UNSPEC_PALIGNR 122) + + ; For SSE4A support + (UNSPEC_EXTRQI 130) + (UNSPEC_EXTRQ 131) + (UNSPEC_INSERTQI 132) + (UNSPEC_INSERTQ 133) + + ; Other AMDFAM10 Patterns + (UNSPEC_CVTSI2SS_AMDFAM10 140) + (UNSPEC_CVTSI2SD_AMDFAM10 141) + (UNSPEC_MOVDSI2SF_AMDFAM10 142) + (UNSPEC_MOVDSI2DF_AMDFAM10 143) + (UNSPEC_UNPCKLPS_AMDFAM10 144) + (UNSPEC_UNPCKLPD_AMDFAM10 145) + (UNSPEC_CVTPS2PD_AMDFAM10 146) + (UNSPEC_CVTPD2PS_AMDFAM10 147) ]) (define_constants @@ -192,7 +208,7 @@ ;; Processor type. This attribute must exactly match the processor_type ;; enumeration in i386.h. -(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64" +(define_attr "cpu" "i386,i486,pentium,pentiumpro,geode,k6,athlon,pentium4,k8,nocona,core2,generic32,generic64,amdfam10" (const (symbol_ref "ix86_tune"))) ;; A basic instruction type. Refinements due to arguments to be @@ -203,10 +219,10 @@ incdec,ishift,ishift1,rotate,rotate1,imul,idiv, icmp,test,ibr,setcc,icmov, push,pop,call,callv,leave, - str,cld, + str,bitmanip,cld, fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint, sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv, + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins, mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft" (const_string "other")) @@ -220,7 +236,7 @@ (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint") (const_string "i387") (eq_attr "type" "sselog,sselog1,sseiadd,sseishft,sseimul, - sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv") + sse,ssemov,sseadd,ssemul,ssecmp,ssecomi,ssecvt,sseicvt,ssediv,sseins") (const_string "sse") (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft") (const_string "mmx") @@ -230,7 +246,8 @@ ;; The (bounding maximum) length of an instruction immediate. 
(define_attr "length_immediate" "" - (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave") + (cond [(eq_attr "type" "incdec,setcc,icmov,str,cld,lea,other,multi,idiv,leave, + bitmanip") (const_int 0) (eq_attr "unit" "i387,sse,mmx") (const_int 0) @@ -284,7 +301,7 @@ ;; Set when 0f opcode prefix is used. (define_attr "prefix_0f" "" (if_then_else - (ior (eq_attr "type" "imovx,setcc,icmov") + (ior (eq_attr "type" "imovx,setcc,icmov,bitmanip") (eq_attr "unit" "sse,mmx")) (const_int 1) (const_int 0))) @@ -413,7 +430,7 @@ (const_string "load") (and (eq_attr "type" "!alu1,negnot,ishift1, - imov,imovx,icmp,test, + imov,imovx,icmp,test,bitmanip, fmov,fcmp,fsgn, sse,ssemov,ssecmp,ssecomi,ssecvt,sseicvt,sselog1, mmx,mmxmov,mmxcmp,mmxcvt") @@ -968,10 +985,11 @@ "sahf" [(set_attr "length" "1") (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "SI")]) ;; Pentium Pro can do steps 1 through 3 in one go. - +;; comi*, ucomi*, fcomi*, ficomi*,fucomi* (i387 instructions set condition codes) (define_insn "*cmpfp_i_mixed" [(set (reg:CCFP FLAGS_REG) (compare:CCFP (match_operand 0 "register_operand" "f,x") @@ -985,7 +1003,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_sse" [(set (reg:CCFP FLAGS_REG) @@ -1000,7 +1019,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_i_i387" [(set (reg:CCFP FLAGS_REG) @@ -1019,7 +1039,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_mixed" [(set (reg:CCFPU FLAGS_REG) @@ -1034,7 +1055,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_sse" [(set (reg:CCFPU FLAGS_REG) @@ -1049,7 +1071,8 @@ (if_then_else (match_operand:SF 1 "" "") (const_string "SF") (const_string "DF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*cmpfp_iu_387" [(set (reg:CCFPU FLAGS_REG) @@ -1068,7 +1091,8 @@ (const_string "DF") ] (const_string "XF"))) - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "direct")]) ;; Move instructions. 
@@ -1274,7 +1298,8 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movhi" [(set (match_operand:HI 0 "nonimmediate_operand" "") @@ -1391,8 +1416,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swaphi_2" [(set (match_operand:HI 0 "register_operand" "+r") (match_operand:HI 1 "register_operand" "+r")) @@ -1565,8 +1592,10 @@ [(set_attr "type" "imov") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) +;; Not added amdfam10_decode since TARGET_PARTIAL_REG_STALL is disabled for AMDFAM10 (define_insn "*swapqi_2" [(set (match_operand:QI 0 "register_operand" "+q") (match_operand:QI 1 "register_operand" "+q")) @@ -2120,7 +2149,8 @@ [(set_attr "type" "imov") (set_attr "mode" "DI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "double")]) (define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand" "") @@ -3539,10 +3569,43 @@ } operands[1] = validize_mem (force_const_mem (SFmode, operands[1])); } + + if (TARGET_USE_VECTOR_CONVERTS && !optimize_size + && (GET_CODE (operands[1]) != MEM) && (GET_CODE (operands[0]) != MEM) + && TARGET_SSE_MATH && optimize) + { + /* For converting SF(xmm2) to DF(xmm1), the following code is faster + in AMDFAM10 instead of using cvtss2sd + unpcklps xmm2,xmm2 + cvtps2pd xmm2,xmm1 + */ + emit_insn (gen_sse_unpcklps_amdfam10 (operands[1], operands[1])); + emit_insn (gen_sse2_cvtps2pd_amdfam10 (operands[0], operands[1])); + DONE; + } + if (GET_CODE (operands[0]) == MEM && GET_CODE (operands[1]) == MEM) operands[1] = force_reg (SFmode, operands[1]); }) +(define_insn "sse_unpcklps_amdfam10" + [(set (match_operand:SF 0 "register_operand" "=Y") + (unspec:SF [(match_operand: SF 1 "register_operand" "0")] + UNSPEC_UNPCKLPS_AMDFAM10))] + "TARGET_USE_VECTOR_CONVERTS && TARGET_SSE" + "unpcklps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V4SF")]) + +(define_insn "sse2_cvtps2pd_amdfam10" + [(set (match_operand:DF 0 "register_operand" "=Y") + (unspec:DF [(match_operand: SF 1 "register_operand" "Y")] + UNSPEC_CVTPS2PD_AMDFAM10))] + "TARGET_USE_VECTOR_CONVERTS && TARGET_SSE2" + "cvtps2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V2DF")]) + (define_insn "*extendsfdf2_mixed" [(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,Y") (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "fm,f,mY")))] @@ -3713,6 +3776,20 @@ (match_operand:DF 1 "nonimmediate_operand" "")))] "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)" { + if (TARGET_USE_VECTOR_CONVERTS && !optimize_size + && (GET_CODE (operands[1]) != MEM) && (GET_CODE (operands[0]) != MEM) + && TARGET_SSE_MATH && optimize) + { + /* For converting DF(xmm1) to SF(xmm2), the following code is faster + in AMDFAM10 instead of using cvtsd2ss + unpcklpd xmm2,xmm2 + cvtpd2ps xmm2,xmm1 + */ + emit_insn (gen_sse_unpcklpd_amdfam10 (operands[1], operands[1])); + emit_insn (gen_sse2_cvtpd2ps_amdfam10 (operands[0], operands[1])); + DONE; + } + if (MEM_P (operands[0]) 
&& MEM_P (operands[1])) operands[1] = force_reg (DFmode, operands[1]); @@ -3728,6 +3805,24 @@ } }) +(define_insn "sse_unpcklpd_amdfam10" + [(set (match_operand:DF 0 "register_operand" "=Y") + (unspec:DF [(match_operand: DF 1 "register_operand" "0")] + UNSPEC_UNPCKLPD_AMDFAM10))] + "TARGET_USE_VECTOR_CONVERTS && TARGET_SSE" + "unpcklpd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V2DF")]) + +(define_insn "sse2_cvtpd2ps_amdfam10" + [(set (match_operand:SF 0 "register_operand" "=Y") + (unspec:SF [(match_operand: DF 1 "register_operand" "Y")] + UNSPEC_CVTPD2PS_AMDFAM10))] + "TARGET_USE_VECTOR_CONVERTS && TARGET_SSE2" + "cvtpd2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V4SF")]) + (define_expand "truncdfsf2_with_temp" [(parallel [(set (match_operand:SF 0 "" "") (float_truncate:SF (match_operand:DF 1 "" ""))) @@ -4150,7 +4245,8 @@ "cvttss2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfdi_sse" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -4159,7 +4255,8 @@ "cvttsd2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncsfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4168,7 +4265,8 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "fix_truncdfsi_sse" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -4177,7 +4275,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) ;; Avoid vector decoded forms of the instruction. (define_peephole2 @@ -4438,7 +4537,8 @@ [(set_attr "length" "2") (set_attr "mode" "HI") (set_attr "unit" "i387") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) ;; Conversion between fixed point and floating point. 
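The expander comments above spell out the AMDFAM10-specific replacement sequences for scalar float/double conversions. The hedged C sketch below (not from the patch) is the kind of plain source that would exercise those expanders; the exact assembly naturally depends on -march, -mfpmath and the optimization level, as the TARGET_USE_VECTOR_CONVERTS conditions show.

/* Hedged sketch: scalar conversions handled by the expanders above.
   Per the patch comments, with roughly "gcc -O2 -march=amdfam10 -mfpmath=sse"
   these may be emitted as unpcklps+cvtps2pd / unpcklpd+cvtpd2ps instead of
   cvtss2sd / cvtsd2ss.  */
double
widen (float f)
{
  /* extendsfdf2 */
  return (double) f;
}

float
narrow (double d)
{
  /* truncdfsf2 */
  return (float) d;
}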
@@ -4474,8 +4574,47 @@ [(set (match_operand:SF 0 "register_operand" "") (float:SF (match_operand:SI 1 "nonimmediate_operand" "")))] "TARGET_80387 || TARGET_SSE_MATH" - "") + " + { + /* For converting SI to SF, the following code is faster in AMDFAM10 + mov reg32, mem32 + movd mem32, xmm + cvtdq2ps xmm,xmm + */ + + if (TARGET_USE_VECTOR_CONVERTS && !optimize_size + && (GET_CODE (operands[1]) != MEM) && TARGET_SSE_MATH + && optimize ) + { + rtx tmp; + tmp = assign_386_stack_local (SImode, SLOT_TEMP); + emit_move_insn (tmp, operands[1]); + emit_insn (gen_sse2_movdsi2sf_amdfam10 (operands[0], tmp)); + emit_insn (gen_sse2_cvtdq2ps_amdfam10 (operands[0], operands[0])); + DONE; + } + } + ") +(define_insn "sse2_cvtdq2ps_amdfam10" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SF 1 "register_operand" "x")] + UNSPEC_CVTSI2SS_AMDFAM10))] + "TARGET_SSE2 && TARGET_USE_VECTOR_CONVERTS" + "cvtdq2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "V4SF")]) + +(define_insn "sse2_movdsi2sf_amdfam10" + [(set (match_operand:SF 0 "register_operand" "=x") + (unspec:SF [(match_operand:SI 1 "memory_operand" "m")] + UNSPEC_MOVDSI2SF_AMDFAM10))] + "TARGET_SSE2 && TARGET_USE_VECTOR_CONVERTS" + "movd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "SF")]) + + (define_insn "*floatsisf2_mixed" [(set (match_operand:SF 0 "register_operand" "=f,?f,x,x") (float:SF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,mr")))] @@ -4489,6 +4628,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_sse" @@ -4499,6 +4639,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsisf2_i387" @@ -4532,6 +4673,7 @@ (set_attr "mode" "SF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,vector,double") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_sse" @@ -4542,6 +4684,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "SF") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdisf2_i387" @@ -4585,8 +4728,46 @@ [(set (match_operand:DF 0 "register_operand" "") (float:DF (match_operand:SI 1 "nonimmediate_operand" "")))] "TARGET_80387 || (TARGET_SSE2 && TARGET_SSE_MATH)" - "") + " + { + /* For converting SI to DF, the following code is faster in AMDFAM10 + mov reg32, mem32 + movd mem32, xmm + cvtdq2pd xmm,xmm + */ + + if (TARGET_USE_VECTOR_CONVERTS && !optimize_size + && (GET_CODE (operands[1]) != MEM) && TARGET_SSE_MATH + && optimize) + { + rtx tmp; + tmp = assign_386_stack_local (SImode, SLOT_TEMP); + emit_move_insn (tmp, operands[1]); + emit_insn (gen_sse2_movdsi2df_amdfam10 (operands[0], tmp)); + emit_insn (gen_sse2_cvtdq2pd_amdfam10 (operands[0], operands[0])); + DONE; + } + } + ") +(define_insn "sse2_cvtdq2pd_amdfam10" + [(set (match_operand:DF 0 "register_operand" "=Y") + (unspec:DF [(match_operand:DF 1 "register_operand" "Y")] + UNSPEC_CVTSI2SD_AMDFAM10))] + "TARGET_SSE2 && TARGET_USE_VECTOR_CONVERTS" + "cvtdq2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "DF")]) + +(define_insn "sse2_movdsi2df_amdfam10" + [(set (match_operand:DF 0 "register_operand" "=Y") + (unspec:DF 
[(match_operand:SI 1 "memory_operand" "m")] + UNSPEC_MOVDSI2DF_AMDFAM10))] + "TARGET_SSE2 && TARGET_USE_VECTOR_CONVERTS" + "movd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "SF")]) + (define_insn "*floatsidf2_mixed" [(set (match_operand:DF 0 "register_operand" "=f,?f,Y,Y") (float:DF (match_operand:SI 1 "nonimmediate_operand" "m,r,r,mr")))] @@ -4600,6 +4781,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_sse" @@ -4610,6 +4792,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatsidf2_i387" @@ -4643,6 +4826,7 @@ (set_attr "mode" "DF") (set_attr "unit" "*,i387,*,*") (set_attr "athlon_decode" "*,*,double,direct") + (set_attr "amdfam10_decode" "*,*,vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_sse" @@ -4653,6 +4837,7 @@ [(set_attr "type" "sseicvt") (set_attr "mode" "DF") (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double") (set_attr "fp_int_src" "true")]) (define_insn "*floatdidf2_i387" @@ -6860,6 +7045,14 @@ "TARGET_64BIT" "") +;; On AMDFAM10 +;; IMUL reg64, reg64, imm8 Direct +;; IMUL reg64, mem64, imm8 VectorPath +;; IMUL reg64, reg64, imm32 Direct +;; IMUL reg64, mem64, imm32 VectorPath +;; IMUL reg64, reg64 Direct +;; IMUL reg64, mem64 Direct + (define_insn "*muldi3_1_rex64" [(set (match_operand:DI 0 "register_operand" "=r,r,r") (mult:DI (match_operand:DI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6882,6 +7075,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "DI")]) (define_expand "mulsi3" @@ -6892,6 +7090,14 @@ "" "") +;; On AMDFAM10 +;; IMUL reg32, reg32, imm8 Direct +;; IMUL reg32, mem32, imm8 VectorPath +;; IMUL reg32, reg32, imm32 Direct +;; IMUL reg32, mem32, imm32 VectorPath +;; IMUL reg32, reg32 Direct +;; IMUL reg32, mem32 Direct + (define_insn "*mulsi3_1" [(set (match_operand:SI 0 "register_operand" "=r,r,r") (mult:SI (match_operand:SI 1 "nonimmediate_operand" "%rm,rm,0") @@ -6913,6 +7119,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_insn "*mulsi3_1_zext" @@ -6938,6 +7149,11 @@ (match_operand 1 "memory_operand" "")) (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(and (eq_attr "alternative" "0,1") + (match_operand 1 "memory_operand" "")) + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "SI")]) (define_expand "mulhi3" @@ -6948,6 +7164,13 @@ "TARGET_HIMODE_MATH" "") +;; On AMDFAM10 +;; IMUL reg16, reg16, imm8 VectorPath +;; IMUL reg16, mem16, imm8 VectorPath +;; IMUL reg16, reg16, imm16 VectorPath +;; IMUL reg16, mem16, imm16 VectorPath +;; IMUL reg16, reg16 Direct +;; IMUL reg16, mem16 Direct (define_insn "*mulhi3_1" [(set (match_operand:HI 0 "register_operand" "=r,r,r") (mult:HI (match_operand:HI 1 "nonimmediate_operand" "%rm,rm,0") @@ 
-6966,6 +7189,10 @@ (eq_attr "alternative" "1,2") (const_string "vector")] (const_string "direct"))) + (set (attr "amdfam10_decode") + (cond [(eq_attr "alternative" "0,1") + (const_string "vector")] + (const_string "direct"))) (set_attr "mode" "HI")]) (define_expand "mulqi3" @@ -6976,6 +7203,10 @@ "TARGET_QIMODE_MATH" "") +;;On AMDFAM10 +;; MUL reg8 Direct +;; MUL mem8 Direct + (define_insn "*mulqi3_1" [(set (match_operand:QI 0 "register_operand" "=a") (mult:QI (match_operand:QI 1 "nonimmediate_operand" "%0") @@ -6990,6 +7221,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulqihi3" @@ -7016,6 +7248,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "mulqihi3" @@ -7040,6 +7273,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "direct"))) + (set_attr "amdfam10_decode" "direct") (set_attr "mode" "QI")]) (define_expand "umulditi3" @@ -7066,6 +7300,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) ;; We can't use this pattern in 64bit mode, since it results in two separate 32bit registers @@ -7093,6 +7328,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "mulditi3" @@ -7119,6 +7355,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "mulsidi3" @@ -7145,6 +7382,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "umuldi3_highpart" @@ -7181,6 +7419,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "umulsi3_highpart" @@ -7216,6 +7455,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*umulsi3_highpart_zext" @@ -7238,6 +7478,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_expand "smuldi3_highpart" @@ -7273,6 +7514,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "DI")]) (define_expand "smulsi3_highpart" @@ -7307,6 +7549,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) (define_insn "*smulsi3_highpart_zext" @@ -7328,6 +7571,7 @@ (if_then_else (eq_attr "cpu" "athlon") (const_string "vector") (const_string "double"))) + (set_attr "amdfam10_decode" "double") (set_attr "mode" "SI")]) ;; The patterns that match these are at the end of this file. 
@@ -10309,7 +10553,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_64_shift_adj" [(set (reg:CCZ FLAGS_REG) @@ -10524,7 +10769,8 @@ (set_attr "prefix_0f" "1") (set_attr "mode" "SI") (set_attr "pent_pair" "np") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "x86_shift_adj_1" [(set (reg:CCZ FLAGS_REG) @@ -11284,7 +11530,8 @@ [(set_attr "type" "ishift") (set_attr "prefix_0f" "1") (set_attr "mode" "DI") - (set_attr "athlon_decode" "vector")]) + (set_attr "athlon_decode" "vector") + (set_attr "amdfam10_decode" "vector")]) (define_expand "ashrdi3" [(set (match_operand:DI 0 "shiftdi_operand" "") @@ -14558,8 +14805,24 @@ [(set (match_dup 0) (xor:SI (match_dup 0) (const_int 31))) (clobber (reg:CC FLAGS_REG))])] "" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzsi2_abm (operands[0], operands[1])); + DONE; + } +}) +(define_insn "clzsi2_abm" + [(set (match_operand:SI 0 "register_operand" "=r") + (clz:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_insn "*bsr" [(set (match_operand:SI 0 "register_operand" "=r") (minus:SI (const_int 31) @@ -14567,8 +14830,45 @@ (clobber (reg:CC FLAGS_REG))] "" "bsr{l}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "SI")]) +(define_insn "popcountsi2" + [(set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:SI 0 "register_operand" "=r") + (popcount:SI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + +(define_insn "*popcountsi2_cmp_zext" + [(set (reg FLAGS_REG) + (compare + (popcount:SI (match_operand:SI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (zero_extend:DI(popcount:SI (match_dup 1))))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{l}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "SI")]) + (define_expand "clzdi2" [(parallel [(set (match_operand:DI 0 "register_operand" "") @@ -14579,8 +14879,24 @@ [(set (match_dup 0) (xor:DI (match_dup 0) (const_int 63))) (clobber (reg:CC FLAGS_REG))])] "TARGET_64BIT" - "") +{ + if (TARGET_ABM) + { + emit_insn (gen_clzdi2_abm (operands[0], operands[1])); + DONE; + } +}) +(define_insn "clzdi2_abm" + [(set (match_operand:DI 0 "register_operand" "=r") + (clz:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_ABM" + "lzcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + (define_insn "*bsr_rex64" [(set (match_operand:DI 0 "register_operand" "=r") 
(minus:DI (const_int 63) @@ -14588,7 +14904,92 @@ (clobber (reg:CC FLAGS_REG))] "TARGET_64BIT" "bsr{q}\t{%1, %0|%0, %1}" - [(set_attr "prefix_0f" "1")]) + [(set_attr "prefix_0f" "1") + (set_attr "mode" "DI")]) + +(define_insn "popcountdi2" + [(set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_64BIT && TARGET_POPCNT" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_insn "*popcountdi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:DI (match_operand:DI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:DI 0 "register_operand" "=r") + (popcount:DI (match_dup 1)))] + "TARGET_64BIT && TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{q}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "DI")]) + +(define_expand "clzhi2" + [(parallel + [(set (match_operand:HI 0 "register_operand" "") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "")))) + (clobber (reg:CC FLAGS_REG))]) + (parallel + [(set (match_dup 0) (xor:HI (match_dup 0) (const_int 15))) + (clobber (reg:CC FLAGS_REG))])] + "" +{ + if (TARGET_ABM) + { + emit_insn (gen_clzhi2_abm (operands[0], operands[1])); + DONE; + } +}) + +(define_insn "clzhi2_abm" + [(set (match_operand:HI 0 "register_operand" "=r") + (clz:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_ABM" + "lzcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*bsrhi" + [(set (match_operand:HI 0 "register_operand" "=r") + (minus:HI (const_int 15) + (clz:HI (match_operand:HI 1 "nonimmediate_operand" "rm")))) + (clobber (reg:CC FLAGS_REG))] + "" + "bsr{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_0f" "1") + (set_attr "mode" "HI")]) + +(define_insn "popcounthi2" + [(set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" ""))) + (clobber (reg:CC FLAGS_REG))] + "TARGET_POPCNT" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) + +(define_insn "*popcounthi2_cmp" + [(set (reg FLAGS_REG) + (compare + (popcount:HI (match_operand:HI 1 "nonimmediate_operand" "rm")) + (const_int 0))) + (set (match_operand:HI 0 "register_operand" "=r") + (popcount:HI (match_dup 1)))] + "TARGET_POPCNT && ix86_match_ccmode (insn, CCZmode)" + "popcnt{w}\t{%1, %0|%0, %1}" + [(set_attr "prefix_rep" "1") + (set_attr "type" "bitmanip") + (set_attr "mode" "HI")]) ;; Thread-local storage patterns for ELF. 
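The clz*/popcount* patterns above back GCC's generic bit-counting built-ins with the new lzcnt and popcnt instructions when -mabm / -mpopcnt are in effect. A hedged usage sketch, with invented function names, is shown below; without those flags the same source still compiles and falls back to GCC's generic expansions.

/* Hedged sketch: the built-ins these patterns implement.  */
unsigned int
bits_set (unsigned long x)
{
  /* Expands to popcnt{l,q} when -mpopcnt (or -march=amdfam10) is enabled.  */
  return (unsigned int) __builtin_popcountl (x);
}

unsigned int
leading_zeros (unsigned int x)
{
  /* __builtin_clz is undefined for 0, so guard it; with -mabm the clzsi2
     expander above emits lzcnt.  */
  return x ? (unsigned int) __builtin_clz (x) : 32U;
}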
;; @@ -15503,7 +15904,8 @@ "sqrtss\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "SF") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_insn "*sqrtsf2_i387" [(set (match_operand:SF 0 "register_operand" "=f") @@ -15541,7 +15943,8 @@ "sqrtsd\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "mode" "DF") - (set_attr "athlon_decode" "*")]) + (set_attr "athlon_decode" "*") + (set_attr "amdfam10_decode" "*")]) (define_insn "*sqrtdf2_i387" [(set (match_operand:DF 0 "register_operand" "=f") @@ -15570,7 +15973,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*sqrtextendsfxf2_i387" [(set (match_operand:XF 0 "register_operand" "=f") @@ -15580,7 +15984,8 @@ "fsqrt" [(set_attr "type" "fpspc") (set_attr "mode" "XF") - (set_attr "athlon_decode" "direct")]) + (set_attr "athlon_decode" "direct") + (set_attr "amdfam10_decode" "direct")]) (define_insn "*sqrtextenddfxf2_i387" [(set (match_operand:XF 0 "register_operand" "=f") @@ -20391,7 +20796,7 @@ (mult:DI (match_operand:DI 1 "memory_operand" "") (match_operand:DI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:DI (match_dup 3) (match_dup 2))) @@ -20404,7 +20809,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (mult:SI (match_dup 3) (match_dup 2))) @@ -20418,7 +20823,7 @@ (mult:SI (match_operand:SI 1 "memory_operand" "") (match_operand:SI 2 "immediate_operand" "")))) (clobber (reg:CC FLAGS_REG))])] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && !satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 1)) (parallel [(set (match_dup 0) (zero_extend:DI (mult:SI (match_dup 3) (match_dup 2)))) @@ -20435,7 +20840,7 @@ (match_operand:DI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:DI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:DI (match_dup 0) (match_dup 3))) @@ -20451,7 +20856,7 @@ (match_operand:SI 2 "const_int_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:SI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size && satisfies_constraint_K (operands[2])" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) (mult:SI (match_dup 0) (match_dup 3))) @@ -20467,7 +20872,7 @@ (match_operand:HI 2 "immediate_operand" ""))) (clobber (reg:CC FLAGS_REG))]) (match_scratch:HI 3 "r")] - "(TARGET_K8 || TARGET_GENERIC64) && !optimize_size" + "(TARGET_K8 || TARGET_GENERIC64 || TARGET_AMDFAM10) && !optimize_size" [(set (match_dup 3) (match_dup 2)) (parallel [(set (match_dup 0) 
(mult:HI (match_dup 0) (match_dup 3))) (clobber (reg:CC FLAGS_REG))])] Index: contrib/gcc/config/i386/pmmintrin.h =================================================================== --- contrib/gcc/config/i386/pmmintrin.h (revision 237107) +++ contrib/gcc/config/i386/pmmintrin.h (working copy) @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. @@ -30,7 +30,11 @@ #ifndef _PMMINTRIN_H_INCLUDED #define _PMMINTRIN_H_INCLUDED -#ifdef __SSE3__ +#ifndef __SSE3__ +# error "SSE3 instruction set not enabled" +#else + +/* We need definitions from the SSE2 and SSE header files*/ #include #include Index: contrib/gcc/config/i386/sse.md =================================================================== --- contrib/gcc/config/i386/sse.md (revision 237107) +++ contrib/gcc/config/i386/sse.md (working copy) @@ -1,5 +1,5 @@ ;; GCC machine description for SSE instructions -;; Copyright (C) 2005, 2006 +;; Copyright (C) 2005, 2006, 2007 ;; Free Software Foundation, Inc. ;; ;; This file is part of GCC. @@ -920,6 +920,7 @@ "cvtsi2ss\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtsi2ssq" @@ -933,6 +934,7 @@ "cvtsi2ssq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse_cvtss2si" @@ -946,6 +948,7 @@ "cvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2siq" @@ -959,6 +962,7 @@ "cvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse_cvttss2si" @@ -971,6 +975,7 @@ "cvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse_cvttss2siq" @@ -983,6 +988,7 @@ "cvttss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn "sse2_cvtdq2ps" @@ -1852,7 +1858,8 @@ "cvtsi2sd\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsi2sdq" [(set (match_operand:V2DF 0 "register_operand" "=x,x") @@ -1865,7 +1872,8 @@ "cvtsi2sdq\t{%2, %0|%0, %2}" [(set_attr "type" "sseicvt") (set_attr "mode" "DF") - (set_attr "athlon_decode" "double,direct")]) + (set_attr "athlon_decode" "double,direct") + (set_attr "amdfam10_decode" "vector,double")]) (define_insn "sse2_cvtsd2si" [(set (match_operand:SI 0 "register_operand" "=r,r") @@ -1878,6 +1886,7 @@ "cvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2siq" @@ -1891,6 +1900,7 @@ "cvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double") (set_attr "mode" "DI")]) (define_insn 
"sse2_cvttsd2si" @@ -1903,7 +1913,8 @@ "cvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "SI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvttsd2siq" [(set (match_operand:DI 0 "register_operand" "=r,r") @@ -1915,7 +1926,8 @@ "cvttsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "mode" "DI") - (set_attr "athlon_decode" "double,vector")]) + (set_attr "athlon_decode" "double,vector") + (set_attr "amdfam10_decode" "double,double")]) (define_insn "sse2_cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -1946,7 +1958,8 @@ "TARGET_SSE2" "cvtpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") @@ -1964,7 +1977,8 @@ "TARGET_SSE2" "cvttpd2dq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "TI")]) + (set_attr "mode" "TI") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") @@ -1978,20 +1992,22 @@ "cvtsd2ss\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") (set_attr "athlon_decode" "vector,double") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) (define_insn "sse2_cvtss2sd" - [(set (match_operand:V2DF 0 "register_operand" "=x") + [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF (float_extend:V2DF (vec_select:V2SF - (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 2 "nonimmediate_operand" "x,m") (parallel [(const_int 0) (const_int 1)]))) - (match_operand:V2DF 1 "register_operand" "0") + (match_operand:V2DF 1 "register_operand" "0,0") (const_int 1)))] "TARGET_SSE2" "cvtss2sd\t{%2, %0|%0, %2}" [(set_attr "type" "ssecvt") + (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "DF")]) (define_expand "sse2_cvtpd2ps" @@ -2012,7 +2028,8 @@ "TARGET_SSE2" "cvtpd2ps\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V4SF")]) + (set_attr "mode" "V4SF") + (set_attr "amdfam10_decode" "double")]) (define_insn "sse2_cvtps2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") @@ -2023,7 +2040,8 @@ "TARGET_SSE2" "cvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") - (set_attr "mode" "V2DF")]) + (set_attr "mode" "V2DF") + (set_attr "amdfam10_decode" "direct")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; @@ -4524,3 +4542,92 @@ "pabs\t{%1, %0|%0, %1}"; [(set_attr "type" "sselog1") (set_attr "mode" "DI")]) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; +;; AMD SSE4A instructions +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(define_insn "sse4a_vmmovntv2df" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_movntdf" + [(set (match_operand:DF 0 "memory_operand" "=m") + (unspec:DF [(match_operand:DF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "DF")]) + +(define_insn "sse4a_vmmovntv4sf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(vec_select:SF + 
(match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)]))] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_movntsf" + [(set (match_operand:SF 0 "memory_operand" "=m") + (unspec:SF [(match_operand:SF 1 "register_operand" "x")] + UNSPEC_MOVNT))] + "TARGET_SSE4A" + "movntss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "SF")]) + +(define_insn "sse4a_extrqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand 2 "const_int_operand" "") + (match_operand 3 "const_int_operand" "")] + UNSPEC_EXTRQI))] + "TARGET_SSE4A" + "extrq\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_extrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V16QI 2 "register_operand" "x")] + UNSPEC_EXTRQ))] + "TARGET_SSE4A" + "extrq\t{%2, %0|%0, %2}" + [(set_attr "type" "sse") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertqi" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x") + (match_operand 3 "const_int_operand" "") + (match_operand 4 "const_int_operand" "")] + UNSPEC_INSERTQI))] + "TARGET_SSE4A" + "insertq\t{%4, %3, %2, %0|%0, %2, %3, %4}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) + +(define_insn "sse4a_insertq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") + (match_operand:V2DI 2 "register_operand" "x")] + UNSPEC_INSERTQ))] + "TARGET_SSE4A" + "insertq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseins") + (set_attr "mode" "TI")]) Index: contrib/gcc/config/i386/emmintrin.h =================================================================== --- contrib/gcc/config/i386/emmintrin.h (revision 237107) +++ contrib/gcc/config/i386/emmintrin.h (working copy) @@ -30,7 +30,11 @@ #ifndef _EMMINTRIN_H_INCLUDED #define _EMMINTRIN_H_INCLUDED -#ifdef __SSE2__ +#ifndef __SSE2__ +# error "SSE2 instruction set not enabled" +#else + +/* We need definitions from the SSE header files*/ #include /* SSE2 */ Index: contrib/gcc/config/i386/driver-i386.c =================================================================== --- contrib/gcc/config/i386/driver-i386.c (revision 237107) +++ contrib/gcc/config/i386/driver-i386.c (working copy) @@ -1,5 +1,5 @@ /* Subroutines for the gcc driver. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. This file is part of GCC. 
@@ -40,6 +40,7 @@ const char *host_detect_local_cpu (int argc, const #define bit_SSE3 (1 << 0) #define bit_SSSE3 (1 << 9) +#define bit_SSE4a (1 << 6) #define bit_CMPXCHG16B (1 << 13) #define bit_3DNOW (1 << 31) @@ -68,7 +69,7 @@ const char *host_detect_local_cpu (int argc, const unsigned int ext_level; unsigned char has_mmx = 0, has_3dnow = 0, has_3dnowp = 0, has_sse = 0; unsigned char has_sse2 = 0, has_sse3 = 0, has_ssse3 = 0, has_cmov = 0; - unsigned char has_longmode = 0, has_cmpxchg8b = 0; + unsigned char has_longmode = 0, has_cmpxchg8b = 0, has_sse4a = 0; unsigned char is_amd = 0; unsigned int family = 0; bool arch; @@ -120,6 +121,7 @@ const char *host_detect_local_cpu (int argc, const has_3dnow = !!(edx & bit_3DNOW); has_3dnowp = !!(edx & bit_3DNOWP); has_longmode = !!(edx & bit_LM); + has_sse4a = !!(ecx & bit_SSE4a); } is_amd = vendor == *(unsigned int*)"Auth"; @@ -132,6 +134,8 @@ const char *host_detect_local_cpu (int argc, const processor = PROCESSOR_ATHLON; if (has_sse2 || has_longmode) processor = PROCESSOR_K8; + if (has_sse4a) + processor = PROCESSOR_AMDFAM10; } else { @@ -266,6 +270,9 @@ const char *host_detect_local_cpu (int argc, const case PROCESSOR_NOCONA: cpu = "nocona"; break; + case PROCESSOR_AMDFAM10: + cpu = "amdfam10"; + break; case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: cpu = "generic"; Index: contrib/gcc/config/i386/i386.c =================================================================== --- contrib/gcc/config/i386/i386.c (revision 237107) +++ contrib/gcc/config/i386/i386.c (working copy) @@ -548,6 +548,71 @@ struct processor_costs k8_cost = { COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ }; +struct processor_costs amdfam10_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). 
*/ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 5, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -834,11 +899,13 @@ const struct processor_costs *ix86_cost = &pentium #define m_PENT4 (1< static __inline __m128i __attribute__((__always_inline__)) Index: contrib/gcc/config/i386/i386.opt =================================================================== --- contrib/gcc/config/i386/i386.opt (revision 237107) +++ contrib/gcc/config/i386/i386.opt (working copy) @@ -1,6 +1,6 @@ ; Options for the IA-32 and AMD64 ports of the compiler. -; Copyright (C) 2005 Free Software Foundation, Inc. +; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc. ; ; This file is part of GCC. ; @@ -201,6 +201,22 @@ mssse3 Target Report Mask(SSSE3) Support MMX, SSE, SSE2, SSE3 and SSSE3 built-in functions and code generation +msse4a +Target Report Mask(SSE4A) +Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation + +mpopcnt +Target Report Mask(POPCNT) +Support code generation of popcount instruction for popcount built-ins +namely __builtin_popcount, __builtin_popcountl and __builtin_popcountll + +mabm +Target Report Mask(ABM) +Support code generation of Advanced Bit Manipulation (ABM) instructions, +which include popcnt and lzcnt instructions, for popcount and clz built-ins +namely __builtin_popcount, __builtin_popcountl, __builtin_popcountll and +__builtin_clz, __builtin_clzl, __builtin_clzll + msseregparm Target RejectNegative Mask(SSEREGPARM) Use SSE register passing conventions for SF and DF mode Index: contrib/gcc/config/i386/ammintrin.h =================================================================== --- contrib/gcc/config/i386/ammintrin.h (revision 0) +++ contrib/gcc/config/i386/ammintrin.h (working copy) @@ -0,0 +1,73 @@ +/* Copyright (C) 2007 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to + the Free Software Foundation, 51 Franklin Street, Fifth Floor, + Boston, MA 02110-1301, USA. */ + +/* As a special exception, if you include this header file into source + files compiled by GCC, this header file does not by itself cause + the resulting executable to be covered by the GNU General Public + License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General + Public License. */ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +#ifndef __SSE4A__ +# error "SSE4A instruction set not enabled" +#else + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +static __inline void __attribute__((__always_inline__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +static __inline void __attribute__((__always_inline__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +static __inline __m128i __attribute__((__always_inline__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#define _mm_extracti_si64(X, I, L) \ +((__m128i) __builtin_ia32_extrqi ((__v2di)(X), I, L)) + +static __inline __m128i __attribute__((__always_inline__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#define _mm_inserti_si64(X, Y, I, L) \ +((__m128i) __builtin_ia32_insertqi ((__v2di)(X), (__v2di)(Y), I, L)) + + +#endif /* __SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ Property changes on: contrib/gcc/config/i386/ammintrin.h ___________________________________________________________________ Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property
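For completeness, a hedged sketch of the extrq/insertq intrinsics declared in ammintrin.h above. The 16-bit field at bit offset 8 is an arbitrary example, and the (length, index) order of the immediate arguments follows AMD's SSE4A documentation rather than anything stated in the patch, so it should be double-checked against that manual.

/* Hedged sketch: bit-field extract/insert on the low quadword of an XMM
   register.  Compile with -msse4a so __SSE4A__ is defined and ammintrin.h
   accepts the include; the immediates must be compile-time constants.  */
#include <ammintrin.h>

__m128i
extract_field (__m128i x)
{
  /* extrq with immediate operands: take 16 bits starting at bit 8 of the
     low quadword of x.  */
  return _mm_extracti_si64 (x, 16, 8);
}

__m128i
insert_field (__m128i dst, __m128i src)
{
  /* insertq with immediate operands: place the low 16 bits of src's low
     quadword into dst at bit offset 8.  */
  return _mm_inserti_si64 (dst, src, 16, 8);
}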