From 369d087d69dcace779970204daf9b736fcc24c0a Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@numascale.com>
Date: Tue, 31 Mar 2015 11:26:53 +0800
Subject: [PATCH] Add Clocktree 6 support

Register a "clocktree6" clocksource for NumaConnect systems. CPUs return
the local core TSC corrected by a per-node drift value, which is
periodically recalibrated against the 32-bit northbridge TSC on the
master node, read through the global MMCONFIG space. NB-to-core and
core-to-NB frequency ratios are held as 40-bit fixed-point values, and
wraps of the 32-bit NB counter are tracked globally so the derived
timebase stays consistent across nodes.

Signed-off-by: Daniel J Blueman <daniel@numascale.com>
---
 arch/x86/include/asm/numachip/numachip_common.h |  42 ++++
 arch/x86/platform/Makefile                      |   1 +
 arch/x86/platform/numachip/Makefile             |   2 +
 arch/x86/platform/numachip/clocktree.c          | 277 ++++++++++++++++++++++++
 4 files changed, 322 insertions(+)
 create mode 100644 arch/x86/include/asm/numachip/numachip_common.h
 create mode 100644 arch/x86/platform/numachip/Makefile
 create mode 100644 arch/x86/platform/numachip/clocktree.c

diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 85afde1..9354459 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -10,3 +10,4 @@ obj-y	+= scx200/
 obj-y	+= sfi/
 obj-y	+= ts5500/
 obj-y	+= uv/
+obj-y	+= numachip/
diff --git a/arch/x86/platform/numachip/Makefile b/arch/x86/platform/numachip/Makefile
new file mode 100644
index 0000000..3ffcc7e
--- /dev/null
+++ b/arch/x86/platform/numachip/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_X86_NUMACHIP)		+= clocktree.o
+
diff --git a/arch/x86/platform/numachip/clocktree.c b/arch/x86/platform/numachip/clocktree.c
new file mode 100644
index 0000000..01b46aa
--- /dev/null
+++ b/arch/x86/platform/numachip/clocktree.c
@@ -0,0 +1,277 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/cpu.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/moduleparam.h>
+
+#include <asm/smp.h>
+#include <linux/atomic.h>
+#include <asm/numachip/numachip.h>
+#include <asm/numachip/numachip_csr.h>
+#include <asm/numachip/numachip_common.h>
+
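+/*
+ * The master node exposes a free-running 32-bit northbridge (NB) TSC;
+ * conversions between the NB and core cycle domains use MANTISSA-bit
+ * fixed-point ratios (nb2core/core2nb).  EXPIRY bounds, in core cycles,
+ * how long a cached per-node drift value is reused before the master
+ * NB TSC is re-read.
+ */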
+#define EXPIRY (u64)(2.8E9 / 100)
+#define MANTISSA 40
+#define PCI_MMIO_CONF(sci, bus, device, func, reg) \
+	(resource_size_t)(mcfg_base | ((u64)(sci) << 28) | ((bus) << 20) | ((device) << 15) | ((func) << 12) | (reg))
+#define CTDEBUG 0
+/* Debug helpers: scale core/NB cycle counts for trace output */
+#define C2MS(x) ((x) / (s64)230E5)
+#define N2MS(x) ((x) / (s64)200E5)
+#define WRAP_GRACE_PERIOD (0xffffffff / 3)
+/* Debug-only: selects which CPUs emit trace output */
+/* #define SELECTOR 1 */
+#define SELECTOR (smp_processor_id() == 32 || smp_processor_id() == 31)
+/* #define SELECTOR (__this_cpu_inc_return(clocktree_counter) % 10000 == 0) */
+
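+/*
+ * Per-node calibration state, shared by all CPUs on one SCI: the drift of
+ * the local core TSC against the reconstructed master timebase, the core
+ * TSC deadline after which that drift is refreshed, the fixed-point
+ * conversion ratios, the mapping of the master NB TSC register, the last
+ * NB TSC sample (for wrap detection) and a flag serialising refreshes.
+ */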
+struct clocktree {
+	s64 drift;
+	u64 last_update;
+	u64 nb2core;
+	u64 core2nb;
+	void __iomem *masterp;
+	u32 nb_tsc_last;
+	u8 updating;
+};
+
+static DEFINE_PER_CPU_READ_MOSTLY(struct clocktree *, clocktreep);
+static struct clocktree *clocktrees[0x1000] __read_mostly;	/* one slot per 12-bit SCI */
+static u64 nb2core __read_mostly;
+static u64 core2nb __read_mostly;
+static void __iomem *masterp __read_mostly;
+static u16 master_sci __read_mostly;
+static u64 __cacheline_aligned_in_smp global_last_wrap;
+static bool clocktree_enabled __read_mostly = true;
+#if CTDEBUG >= 2 || defined(TEST)
+static char __cacheline_aligned_in_smp lastmsg[128];
+static DEFINE_PER_CPU(u64, clocktree_counter);
+#endif
+#ifdef OLD
+static DEFINE_PER_CPU(u64, clocktree_last);
+#endif
+#ifdef TEST
+static u64 __cacheline_aligned_in_smp last;
+#endif
+
+module_param_named(enabled, clocktree_enabled, bool, S_IWUSR | S_IRUGO);
+
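+/*
+ * Clocksource read callback: normally returns the local core TSC minus the
+ * node's cached drift.  Every EXPIRY core cycles one CPU per node re-reads
+ * the master NB TSC, accounts for 32-bit wraps and recomputes the drift.
+ */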
+static cycles_t clocktree_read(struct clocksource *cs)
+{
+	struct clocktree *ct;
+	u64 core_tsc, final;
+	u32 wraps;
+
+	preempt_disable();
+	ct = __this_cpu_read(clocktreep);
+	core_tsc = __native_read_tsc();
+
+	/* If stale, update */
+	if (unlikely(core_tsc > (ct->last_update + EXPIRY) && !cmpxchg(&ct->updating, 0, 1))) {
+		u64 nb_total;
+		const u32 nb_tsc = readl(ct->masterp);
+
+		/* Average core TSC reads to remove half remote cycle offset */
+		rdtsc_barrier();
+		core_tsc = ((__uint128_t)core_tsc + __native_read_tsc()) / 2;
+		ct->last_update = core_tsc + EXPIRY;
+
+		/* Check if wrap observed */
+		if (unlikely(nb_tsc < ct->nb_tsc_last)) {
+			const u64 _final = (s64)core_tsc - ct->drift;
+
+			/* Update wrap counter, avoiding racing with others */
+			const u64 _global_last_wrap = ACCESS_ONCE(global_last_wrap);
+			const u32 global_wraps = ((__uint128_t)global_last_wrap * ct->core2nb) >> (32 + MANTISSA);
+			const u32 wraps = ((__uint128_t)_final * ct->core2nb) >> (32 + MANTISSA);
+			if (wraps > global_wraps)
+				/* Expensive invalidation, so prevent racing */
+				cmpxchg(&global_last_wrap, _global_last_wrap, _final);
+		}
+		ct->nb_tsc_last = nb_tsc;
+
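+		/*
+		 * Reconstruct the full master timebase: wrap count (derived
+		 * from the core-domain timestamp of the last recorded wrap)
+		 * plus the 32-bit NB TSC, rescaled to core cycles.
+		 */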
+		wraps = ((__uint128_t)global_last_wrap * ct->core2nb) >> (32 + MANTISSA);
+		nb_total = ((u64)wraps << 32) + nb_tsc;
+		final = ((__uint128_t)nb_total * ct->nb2core) >> MANTISSA;
+
+		/* How far remote TSC is ahead of master TSC */
+		ct->drift = (s64)core_tsc - final;
+
+#if CTDEBUG >= 2
+		if (SELECTOR)
+			pr_err("cpu=%4d/%03x final=%llums core_tsc=%llums drift=%lldms nb_tsc_last=%llums global_last_wrap=%llums nb_tsc=%llums wraps=%u nb_total=%llu\n",
+			  smp_processor_id(), read_lcsr(CSR_G0_NODE_IDS) >> 16, C2MS(final), C2MS(core_tsc),
+			  C2MS(ct->drift), N2MS(ct->nb_tsc_last), C2MS(global_last_wrap), N2MS(nb_tsc), wraps, N2MS(nb_total));
+#endif
+		/* Publish the refreshed drift before releasing the update flag */
+		smp_wmb();
+		ct->updating = 0;
+		goto out;
+	}
+
+	final = (s64)core_tsc - ct->drift;
+
+#if CTDEBUG >= 2
+	snprintf(lastmsg, sizeof(lastmsg), "cpu=%4d/%03x final=%llums core_tsc=%llums drift=%lldms nb_tsc_last=%llums global_last_wrap=%llums\n",
+			  smp_processor_id(), read_lcsr(CSR_G0_NODE_IDS) >> 16, C2MS(final), C2MS(core_tsc),
+			  C2MS(ct->drift), N2MS(ct->nb_tsc_last), C2MS(global_last_wrap));
+#endif
+out:
+#ifdef OLD
+	if (final < __this_cpu_read(clocktree_last))
+		pr_err("cpu=%4d/%03x final=%llums last=%llums d=%llu core_tsc=%llums drift=%lldms nb_tsc_last=%10u global_last_wrap=%llums BACK\n", 
+		  smp_processor_id(), read_lcsr(CSR_G0_NODE_IDS) >> 16, C2MS(final), C2MS(__this_cpu_read(clocktree_last)),
+		  __this_cpu_read(clocktree_last) - final, C2MS(core_tsc), C2MS(ct->drift), ct->nb_tsc_last, C2MS(global_last_wrap));
+	else if (final > (__this_cpu_read(clocktree_last) + (u64)(2.8E9 * 10)))
+		pr_err("cpu=%4d/%03x final=%llums last=%llums d=%llu core_tsc=%llums drift=%lldms nb_tsc_last=%10u global_last_wrap=%llums FORW\n", 
+		  smp_processor_id(), read_lcsr(CSR_G0_NODE_IDS) >> 16, C2MS(final), C2MS(__this_cpu_read(clocktree_last)),
+		  final - __this_cpu_read(clocktree_last), C2MS(core_tsc), C2MS(ct->drift), ct->nb_tsc_last, C2MS(global_last_wrap));
+
+	__this_cpu_write(clocktree_last, final);
+#endif
+
+#ifdef TEST
+	u64 _last = ACCESS_ONCE(last);
+	if (final > (_last + 100000000)) {
+		u64 d = final - _last;
+		pr_err("%s", lastmsg);
+		pr_err("cpu=%4d/%03x final=%llums last=%llums d=%llums core_tsc=%llums drift=%lldms nb_tsc_last=%llums global_last_wrap=%llums updating=%u last_update=%llums FORW\n", 
+		  smp_processor_id(), read_lcsr(CSR_G0_NODE_IDS) >> 16, C2MS(final), C2MS(_last),
+		  C2MS(d), C2MS(core_tsc), C2MS(ct->drift), N2MS(ct->nb_tsc_last), C2MS(global_last_wrap), ct->updating, C2MS(ct->last_update));
+	}
+	last = final;
+#endif
+	preempt_enable();
+	return final;
+}
+
+static struct clocksource clocksource_nb = {
+	.name                   = "clocktree6",
+	.rating                 = 290,
+	.read                   = clocktree_read,
+	.mask                   = CLOCKSOURCE_MASK(64),
+	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
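+/* IPI callback: sample the boot CPU's TSC for cross-node calibration */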
+static void calibrate(void *param)
+{
+	u64 *val = param;
+
+	rdtscll(*val);
+}
+
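+/*
+ * Bind the calling CPU to its node's clocktree state, allocating and
+ * calibrating it against the master node on first use.  Installed as
+ * x86_cpuinit.early_percpu_clock_init and called directly for core 0.
+ */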
+static void clocktree_online_action(void)
+{
+	struct clocktree *ctree;
+	u16 sci;
+
+	preempt_disable();
+	ctree = __this_cpu_read(clocktreep);
+	if (ctree) {
+		preempt_enable();
+		return;
+	}
+
+	sci = read_lcsr(CSR_G0_NODE_IDS) >> 16;
+	if (!clocktrees[sci]) {
+		ctree = kzalloc_node(sizeof(*ctree), GFP_ATOMIC, cpu_to_node(smp_processor_id()));
+		BUG_ON(!ctree);
+		WARN_ON((unsigned long)ctree & 0x3f); /* Check cacheline alignment */
+
+		ctree->nb2core = nb2core;
+		ctree->core2nb = core2nb;
+		ctree->masterp = masterp;
+
+		{
+			u64 core0_tsc, nb_total, final, core_tsc = __native_read_tsc();
+			u32 wraps;
+			const u32 nb_tsc = readl(masterp);
+
+			/* Average core TSC reads to remove half remote cycle offset */
+			rdtsc_barrier();
+			core_tsc = ((__uint128_t)core_tsc + __native_read_tsc()) / 2;
+
+			if (sci == master_sci)
+				core0_tsc = __native_read_tsc();
+			else
+				smp_call_function_single(0, calibrate, &core0_tsc, 1);
+			wraps = ((__uint128_t)core0_tsc * core2nb) >> (32 + MANTISSA);
+			nb_total = ((u64)wraps << 32) + nb_tsc;
+			final = ((__uint128_t)nb_total * nb2core) >> MANTISSA;
+
+			/* How far remote TSC is ahead of master TSC */
+			ctree->drift = (s64)core_tsc - final;
+
+#if CTDEBUG >= 1
+			pr_err("<%03x drift=%lldms>", sci, C2MS(ctree->drift));
+#endif
+		}
+
+		clocktrees[sci] = ctree;
+	}
+
+	__this_cpu_write(clocktreep, clocktrees[sci]);
+	preempt_enable();
+}
+
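+/*
+ * Derive the NB and core P0 frequencies, compute the fixed-point
+ * conversion ratios, map the master node's NB TSC register and register
+ * the clocksource.
+ */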
+static int __init clocktree_init(void)
+{
+	u64 val6, mcfg_base;
+	u32 core_mhz, nb_mhz, val, boosts = 0;
+	void __iomem *mapping;
+
+	BUILD_BUG_ON(sizeof(struct clocktree) > 64);
+
+	if (numachip_system != 1)
+		return 0;
+
+	if (!clocktree_enabled) {
+		pr_info("clocktree disabled\n");
+		return 0;
+	}
+
+	if (numachip_info->observer) {
+		pr_info("clocktree: observer mode\n");
+		return 0;
+	}
+
+	master_sci = read_lcsr(CSR_G0_NODE_IDS) >> 16;
+	rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg_base);
+	mcfg_base &= ~0x7f;
+
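+	/*
+	 * Read the NB FID/DID from the family-specific northbridge registers
+	 * and the core P0 FID/DID from its P-state MSR to obtain the two
+	 * clock frequencies.
+	 */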
+	if (boot_cpu_data.x86 > 0x10) {
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 5, 0x160), 4);
+		val = readl(mapping);
+		iounmap(mapping);
+		nb_mhz = 200 * (((val >> 1) & 0x1f) + 4) / (1 + ((val >> 7) & 1));
+
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 4, 0x15c), 4);
+		boosts = (readl(mapping) >> 2) & 7;
+		iounmap(mapping);
+	} else {
+		mapping = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 3, 0xd4), 4);
+		val = readl(mapping);
+		iounmap(mapping);
+		rdmsrl(0xc0010071, val6);
+		nb_mhz = 200 * ((val & 0x1f) + 4) / (1 + ((val6 >> 22) & 1));
+	}
+
+	rdmsrl(MSR_AMD_PSTATE_DEF_BASE + boosts, val6); /* P0 */
+	core_mhz = 100 * ((val6 & 0x3f) + 0x10) >> ((val6 >> 6) & 7);
+	nb2core = ((u64)core_mhz << MANTISSA) / nb_mhz;
+	core2nb = ((u64)nb_mhz << MANTISSA) / core_mhz;
+	masterp = ioremap_nocache(PCI_MMIO_CONF(master_sci, 0, 0x18 + numachip_info->neigh_ht, 2, 0xb0), 4);
+#ifdef UNNEEDED
+	{
+		/* Calculate how many times NB TSC has wrapped and write offset */
+		u64 local = __native_read_tsc();
+		u64 wrapped = ((__uint128_t)local * core2nb) >> MANTISSA;
+		writel(wrapped & 0xffffffff, masterp);
+	}
+#endif
+	pr_info("clocktree: core_mhz=%u nb_mhz=%u\n", core_mhz, nb_mhz);
+
+	clocktree_online_action(); /* Setup core 0 */
+	x86_cpuinit.early_percpu_clock_init = clocktree_online_action;
+	global_last_wrap = native_read_tsc();
+	clocksource_register_khz(&clocksource_nb, core_mhz * 1000);
+
+	return 0;
+}
+
+early_initcall(clocktree_init);
-- 
1.9.1

