/* * based on linux-2.6.17.13/arch/i386/kernel/apic.c * * Local APIC handling, local APIC timers * * (c) 1999, 2000 Ingo Molnar * * Fixes * Maciej W. Rozycki : Bits for genuine 82489DX APICs; * thanks to Eric Gilmore * and Rolf G. Tews * for testing these extensively. * Maciej W. Rozycki : Various updates and fixes. * Mikael Pettersson : Power Management for UP-APIC. * Pavel Machek and * Mikael Pettersson : PM converted to driver model. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Knob to control our willingness to enable the local APIC. */ int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ /* * Debug level */ int apic_verbosity; static void apic_pm_activate(void); int modern_apic(void) { unsigned int lvr, version; /* AMD systems use old APIC versions, so check the CPU */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0xf) return 1; lvr = apic_read(APIC_LVR); version = GET_APIC_VERSION(lvr); return version >= 0x14; } /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. */ void ack_bad_irq(unsigned int irq) { printk("unexpected IRQ trap at vector %02x\n", irq); /* * Currently unexpected vectors happen only on SMP and APIC. * We _must_ ack these because every local APIC has only N * irq slots per priority level, and a 'hanging, unacked' IRQ * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. * But only ack when the APIC is enabled -AK */ if (cpu_has_apic) ack_APIC_irq(); } void __init apic_intr_init(void) { #ifdef CONFIG_SMP smp_intr_init(); #endif /* self generated IPI for local APIC timer */ set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); /* IPI vectors for APIC spurious and error interrupts */ set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* thermal monitor LVT interrupt */ #ifdef CONFIG_X86_MCE_P4THERMAL set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif } /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer = 0; static int enabled_via_apicbase; void enable_NMI_through_LVT0 (void * dummy) { unsigned int v, ver; ver = apic_read(APIC_LVR); ver = GET_APIC_VERSION(ver); v = APIC_DM_NMI; /* unmask and set to NMI */ if (!APIC_INTEGRATED(ver)) /* 82489DX */ v |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT0, v); } int get_physical_broadcast(void) { if (modern_apic()) return 0xff; else return 0xf; } int get_maxlvt(void) { unsigned int v, ver, maxlvt; v = apic_read(APIC_LVR); ver = GET_APIC_VERSION(v); /* 82489DXs do not report # of LVT entries. */ maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; return maxlvt; } void clear_local_APIC(void) { int maxlvt; unsigned long v; maxlvt = get_maxlvt(); /* * Masking an LVT entry on a P6 can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. */ v = apic_read(APIC_LVTT); apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ apic_write_around(APIC_LVTT, APIC_LVT_MASKED); apic_write_around(APIC_LVT0, APIC_LVT_MASKED); apic_write_around(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif v = GET_APIC_VERSION(apic_read(APIC_LVR)); if (APIC_INTEGRATED(v)) { /* !82489DX */ if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } } void __init connect_bsp_APIC(void) { if (pic_mode) { /* * Do not trust the local APIC being empty at bootup. */ clear_local_APIC(); /* * PIC mode, enable APIC mode in the IMCR, i.e. * connect BSP's local APIC to INT and NMI lines. */ apic_printk(APIC_VERBOSE, "leaving PIC mode, " "enabling APIC mode.\n"); outb(0x70, 0x22); outb(0x01, 0x23); } enable_apic_mode(); } void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* * Put the board back into PIC mode (has an effect * only on certain older boards). Note that APIC * interrupts, including IPIs, won't work beyond * this point! The only exception are INIT IPIs. */ apic_printk(APIC_VERBOSE, "disabling APIC mode, " "entering PIC mode.\n"); outb(0x70, 0x22); outb(0x00, 0x23); } else { /* Go back to Virtual Wire compatibility mode */ unsigned long value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; apic_write_around(APIC_SPIV, value); if (!virt_wire_setup) { /* For LVT0 make it edge triggered, active high, external and enabled */ value = apic_read(APIC_LVT0); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write_around(APIC_LVT0, value); } else { /* Disable LVT0 */ apic_write_around(APIC_LVT0, APIC_LVT_MASKED); } /* For LVT1 make it edge triggered, active high, nmi and enabled */ value = apic_read(APIC_LVT1); value &= ~( APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); apic_write_around(APIC_LVT1, value); } } void disable_local_APIC(void) { unsigned long value; clear_local_APIC(); /* * Disable APIC (implies clearing of registers * for 82489DX!). */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write_around(APIC_SPIV, value); if (enabled_via_apicbase) { unsigned int l, h; rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); } } /* * This is to verify that we're looking at a real local APIC. * Check these against your board if the CPUs aren't getting * started for no apparent reason. */ int __init verify_local_APIC(void) { unsigned int reg0, reg1; /* * The version register is read-only in a real APIC. */ reg0 = apic_read(APIC_LVR); apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); reg1 = apic_read(APIC_LVR); apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); /* * The two version reads above should print the same * numbers. If the second one is different, then we * poke at a non-APIC. */ if (reg1 != reg0) return 0; /* * Check if the version looks reasonably. */ reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; reg1 = get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; /* * The ID register is read/write in a real APIC. */ reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); /* * The next two are just to see if we have sane values. * They're only really relevant if we're in Virtual Wire * compatibility mode, but most boxes are anymore. */ reg0 = apic_read(APIC_LVT0); apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); reg1 = apic_read(APIC_LVT1); apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); return 1; } void __init sync_Arb_IDs(void) { /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not needed on AMD */ if (modern_apic()) return; /* * Wait for idle. */ apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } extern void __error_in_apic_c (void); /* * An initial setup of the virtual wire mode. */ void __init init_bsp_APIC(void) { unsigned long value, ver; /* * Don't do the setup now if we have a SMP BIOS as the * through-I/O-APIC virtual wire mode might be active. */ if (smp_found_config || !cpu_has_apic) return; value = apic_read(APIC_LVR); ver = GET_APIC_VERSION(value); /* * Do not trust the local APIC being empty at bootup. */ clear_local_APIC(); /* * Enable APIC. */ value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; /* This bit is reserved on P4/Xeon and should be cleared */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write_around(APIC_SPIV, value); /* * Set up the virtual wire mode. */ apic_write_around(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!APIC_INTEGRATED(ver)) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); } void __devinit setup_local_APIC(void) { unsigned long oldvalue, value, ver, maxlvt; int i, j; /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } value = apic_read(APIC_LVR); ver = GET_APIC_VERSION(value); if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) __error_in_apic_c(); /* * Double-check whether this APIC is really registered. */ if (!apic_id_registered()) BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ init_apic_ldr(); /* * Set Task Priority to 'accept all'. We never change this * later on. */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; apic_write_around(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending * interrupt from previous kernel might still have ISR bit set. * * Most probably by now CPU has serviced that pending interrupt and * it might not have done the ack_APIC_irq() because it thought, * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it * does not clear the ISR bit and cpu thinks it has already serivced * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ for (i = APIC_ISR_NR - 1; i >= 0; i--) { value = apic_read(APIC_ISR + i*0x10); for (j = 31; j >= 0; j--) { if (value & (1< 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); oldvalue = apic_read(APIC_ESR); value = ERROR_APIC_VECTOR; // enables sending errors apic_write_around(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ if (maxlvt > 3) apic_write(APIC_ESR, 0); value = apic_read(APIC_ESR); if (value != oldvalue) apic_printk(APIC_VERBOSE, "ESR value before enabling " "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { if (esr_disable) /* * Something untraceble is creating bad interrupts on * secondary quads ... for the moment, just leave the * ESR disabled - we can't do anything useful with the * errors anyway - mbligh */ printk("Leaving ESR disabled.\n"); else printk("No ESR for 82489DX.\n"); } if (nmi_watchdog == NMI_LOCAL_APIC) setup_apic_nmi_watchdog(); apic_pm_activate(); } static struct { int active; /* r/w apic fields */ unsigned int apic_id; unsigned int apic_taskpri; unsigned int apic_ldr; unsigned int apic_dfr; unsigned int apic_spiv; unsigned int apic_lvtt; unsigned int apic_lvtpc; unsigned int apic_lvt0; unsigned int apic_lvt1; unsigned int apic_lvterr; unsigned int apic_tmict; unsigned int apic_tdcr; unsigned int apic_thmr; } apic_pm_state; int lapic_suspend(void) { unsigned long flags; if (!apic_pm_state.active) return 0; apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; } int lapic_resume(void) { unsigned int l, h; unsigned long flags; if (!apic_pm_state.active) return 0; local_irq_save(flags); /* * Make sure the APICBASE points to the right address * * FIXME! This will be wrong if we ever support suspend on * SMP! We'll need to do this as part of the CPU restore! */ rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; wrmsr(MSR_IA32_APICBASE, l, h); apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); apic_write(APIC_DFR, apic_pm_state.apic_dfr); apic_write(APIC_LDR, apic_pm_state.apic_ldr); apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); local_irq_restore(flags); return 0; } /* * If Linux enabled the LAPIC against the BIOS default * disable it down before re-entering the BIOS on shutdown. * Otherwise the BIOS may get confused and not power-off. * Additionally clear all LVT entries before disable_local_APIC * for the case where Linux didn't enable the LAPIC. */ void lapic_shutdown(void) { unsigned long flags; if (!cpu_has_apic) return; local_irq_save(flags); clear_local_APIC(); if (enabled_via_apicbase) disable_local_APIC(); local_irq_restore(flags); } static void apic_pm_activate(void) { apic_pm_state.active = 1; } /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. */ static void __init lapic_disable(char *str) { enable_local_apic = -1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); } custom_param("nolapic", lapic_disable); static void __init lapic_enable(char *str) { enable_local_apic = 1; } custom_param("lapic", lapic_enable); static void __init apic_set_verbosity(char *str) { if (strcmp("debug", str) == 0) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; else printk(KERN_WARNING "APIC Verbosity level %s not recognised" " use apic_verbosity=verbose or apic_verbosity=debug", str); } custom_param("apic_verbosity", apic_set_verbosity); static int __init detect_init_APIC (void) { u32 h, l, features; /* Disabled by kernel option? */ if (enable_local_apic < 0) return -1; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || (boot_cpu_data.x86 == 15)) break; goto no_apic; case X86_VENDOR_INTEL: if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || (boot_cpu_data.x86 == 5 && cpu_has_apic)) break; goto no_apic; default: goto no_apic; } if (!cpu_has_apic) { /* * Over-ride BIOS and try to enable the local * APIC only if "lapic" specified. */ if (enable_local_apic <= 0) { printk("Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* * Some BIOSes disable the local APIC in the * APIC_BASE MSR. This can only be done in * software for Intel P6 or later and AMD K7 * (Model > 1) or later. */ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { printk("Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); enabled_via_apicbase = 1; } } /* * The APIC feature bi
/*
 * lib/msg.c		Netlink Messages Interface
 *
 *	This library is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU Lesser General Public
 *	License as published by the Free Software Foundation version 2.1
 *	of the License.
 *
 * Copyright (c) 2003-2008 Thomas Graf <tgraf@suug.ch>
 */

/**
 * @ingroup core
 * @defgroup msg Messages
 * Netlink Message Construction/Parsing Interface
 * 
 * The following information is partly extracted from RFC3549
 * (ftp://ftp.rfc-editor.org/in-notes/rfc3549.txt)
 *
 * @par Message Format
 * Netlink messages consist of a byte stream with one or multiple
 * Netlink headers and an associated payload.  If the payload is too big
 * to fit into a single message it, can be split over multiple Netlink
 * messages, collectively called a multipart message.  For multipart
 * messages, the first and all following headers have the \c NLM_F_MULTI
 * Netlink header flag set, except for the last header which has the
 * Netlink header type \c NLMSG_DONE.
 *
 * @par
 * The Netlink message header (\link nlmsghdr struct nlmsghdr\endlink) is shown below.
 * @code   
 * 0                   1                   2                   3
 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                          Length                             |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |            Type              |           Flags              |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                      Sequence Number                        |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                      Process ID (PID)                       |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * @endcode
 *
 * @par
 * The netlink message header and payload must be aligned properly:
 * @code
 *  <------- NLMSG_ALIGN(hlen) ------> <---- NLMSG_ALIGN(len) --->
 * +----------------------------+- - -+- - - - - - - - - - -+- - -+
 * |           Header           | Pad |       Payload       | Pad |
 * |      struct nlmsghdr       |     |                     |     |
 * +----------------------------+- - -+- - - - - - - - - - -+- - -+
 * @endcode
 * @par
 * Message Format:
 * @code
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 * @endcode
 * @par
 * The payload may consist of arbitary data but may have strict
 * alignment and formatting rules depening on the specific netlink
 * families.
 * @par
 * @code
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 * @endcode
 * @par The ACK Netlink Message
 * This message is actually used to denote both an ACK and a NACK.
 * Typically, the direction is from FEC to CPC (in response to an ACK
 * request message).  However, the CPC should be able to send ACKs back
 * to FEC when requested.
 * @code
 *  0                   1                   2                   3
 *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                       Netlink message header                  |
 * |                       type = NLMSG_ERROR                      |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                          Error code                           |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |                       OLD Netlink message header              |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * @endcode
 *
 * @par Example
 * @code
 * // Various methods exist to create/allocate a new netlink
 * // message. 
 * //
 * // nlmsg_alloc() will allocate an empty netlink message with
 * // a maximum payload size which defaults to the page size of
 * // the system. This default size can be modified using the
 * // function nlmsg_set_default_size().
 * struct nl_msg *msg = nlmsg_alloc();
 *
 * // Very often, the message type and message flags are known
 * // at allocation time while the other fields are auto generated:
 * struct nl_msg *msg = nlmsg_alloc_simple(MY_TYPE, MY_FLAGS);
 *
 * // Alternatively an existing netlink message header can be used
 * // to inherit the header values:
 * struct nlmsghdr hdr = {
 * 	.nlmsg_type = MY_TYPE,
 * 	.nlmsg_flags = MY_FLAGS,
 * };
 * struct nl_msg *msg = nlmsg_inherit(&hdr);
 *
 * // Last but not least, netlink messages received from netlink sockets
 * // can be converted into nl_msg objects using nlmsg_convert(). This
 * // will create a message with a maximum payload size which equals the
 * // length of the existing netlink message, therefore no more data can
 * // be appened without calling nlmsg_expand() first.
 * struct nl_msg *msg = nlmsg_convert(nlh_from_nl_sock);
 *
 * // Payload may be added to the message via nlmsg_append(). The fourth
 * // parameter specifies the number of alignment bytes the data should
 * // be padding with at the end. Common values are 0 to disable it or
 * // NLMSG_ALIGNTO to ensure proper netlink message padding.
 * nlmsg_append(msg, &mydata, sizeof(mydata), 0);
 *
 * // Sometimes it may be necessary to reserve room for data but defer
 * // the actual copying to a later point, nlmsg_reserve() can be used
 * // for this purpose:
 * void *data = nlmsg_reserve(msg, sizeof(mydata), NLMSG_ALIGNTO);
 *
 * // Attributes may be added using the attributes interface.
 *
 * // After successful use of the message, the memory must be freed
 * // using nlmsg_free()
 * nlmsg_free(msg);
 * @endcode
 * 
 * @par 4) Parsing messages
 * @code
 * int n;
 * unsigned char *buf;
 * struct nlmsghdr *hdr;
 *
 * n = nl_recv(handle, NULL, &buf);
 * 
 * hdr = (struct nlmsghdr *) buf;
 * while (nlmsg_ok(hdr, n)) {
 * 	// Process message here...
 * 	hdr = nlmsg_next(hdr, &n);
 * }
 * @endcode
 * @{
 */

#include <netlink-local.h>
#include <netlink/netlink.h>
#include <netlink/utils.h>
#include <netlink/cache.h>
#include <netlink/attr.h>
#include <netlink/msg.h>
#include <linux/socket.h>

static size_t default_msg_size;

static void __init init_msg_size(void)
{
	default_msg_size = getpagesize();
}

/**
 * @name Attribute Access
 * @{
 */

//** @} */

/**
 * @name Message Parsing
 * @{
 */

/**
 * check if the netlink message fits into the remaining bytes
 * @arg nlh		netlink message header
 * @arg remaining	number of bytes remaining in message stream
 */
int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
	return (remaining >= sizeof(struct nlmsghdr) &&
		nlh->nlmsg_len >= sizeof(struct nlmsghdr) &&
		nlh->nlmsg_len <= remaining);
}

/**
 * next netlink message in message stream
 * @arg nlh		netlink message header
 * @arg remaining	number of bytes remaining in message stream
 *
 * @returns the next netlink message in the message stream and
 * decrements remaining by the size of the current message.
 */
struct nlmsghdr *nlmsg_next(struct nlmsghdr *nlh, int *remaining)
{
	int totlen = NLMSG_ALIGN(nlh->nlmsg_len);

	*remaining -= totlen;

	return (struct nlmsghdr *) ((unsigned char *) nlh + totlen);
}

/**
 * parse attributes of a netlink message
 * @arg nlh		netlink message header
 * @arg hdrlen		length of family specific header
 * @arg tb		destination array with maxtype+1 elements
 * @arg maxtype		maximum attribute type to be expected
 * @arg policy		validation policy
 *
 * See nla_parse()
 */
int nlmsg_parse(struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[],
		int maxtype, struct nla_policy *policy)
{
	if (!nlmsg_valid_hdr(nlh, hdrlen))
		return -NLE_MSG_TOOSHORT;

	return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen),
			 nlmsg_attrlen(nlh, hdrlen), policy);
}

/**
 * nlmsg_validate - validate a netlink message including attributes
 * @arg nlh		netlinket message header
 * @arg hdrlen		length of familiy specific header
 * @arg maxtype		maximum attribute type to be expected
 * @arg policy		validation policy
 */
int nlmsg_validate(struct nlmsghdr *nlh, int hdrlen, int maxtype,
		   struct nla_policy *policy)
{
	if (!nlmsg_valid_hdr(nlh, hdrlen))
		return -NLE_MSG_TOOSHORT;

	return nla_validate(nlmsg_attrdata(nlh, hdrlen),
			    nlmsg_attrlen(nlh, hdrlen), maxtype, policy);
}

/** @} */

/**
 * @name Message Building/Access
 * @{
 */

static struct nl_msg *__nlmsg_alloc(size_t len)
{
	struct nl_msg *nm;

	nm = calloc(1, sizeof(*nm));
	if (!nm)
		goto errout;

	nm->nm_refcnt = 1;

	nm->nm_nlh = malloc(len);
	if (!nm->nm_nlh)
		goto errout;

	memset(nm->nm_nlh, 0, sizeof(struct nlmsghdr));

	nm->nm_protocol = -1;
	nm->nm_size = len;
	nm->nm_nlh->nlmsg_len = nlmsg_total_size(0);

	NL_DBG(2, "msg %p: Allocated new message, maxlen=%zu\n", nm, len);

	return nm;
errout:
	free(nm);
	return NULL;
}

/**
 * Allocate a new netlink message with the default maximum payload size.
 *
 * Allocates a new netlink message without any further payload. The
 * maximum payload size defaults to PAGESIZE or as otherwise specified
 * with nlmsg_set_default_size().
 *
 * @return Newly allocated netlink message or NULL.
 */
struct nl_msg *nlmsg_alloc(void)
{
	return __nlmsg_alloc(default_msg_size);
}

/**
 * Allocate a new netlink message with maximum payload size specified.
 */
struct nl_msg *nlmsg_alloc_size(size_t max)
{
	return __nlmsg_alloc(max);
}

/**
 * Allocate a new netlink message and inherit netlink message header
 * @arg hdr		Netlink message header template
 *
 * Allocates a new netlink message and inherits the original message
 * header. If \a hdr is not NULL it will be used as a template for
 * the netlink message header, otherwise the header is left blank.
 * 
 * @return Newly allocated netlink message or NULL
 */
struct nl_msg *nlmsg_inherit(struct nlmsghdr *hdr)
{
	struct nl_msg *nm;

	nm = nlmsg_alloc();
	if (nm && hdr) {
		struct nlmsghdr *new = nm->nm_nlh;

		new->nlmsg_type = hdr->nlmsg_type;
		new->nlmsg_flags = hdr->nlmsg_flags;
		new->nlmsg_seq = hdr->nlmsg_seq;
		new->nlmsg_pid = hdr->nlmsg_pid;
	}

	return nm;
}

/**
 * Allocate a new netlink message
 * @arg nlmsgtype	Netlink message type
 * @arg flags		Message flags.
 *
 * @return Newly allocated netlink message or NULL.
 */
struct nl_msg *nlmsg_alloc_simple(int nlmsgtype, int flags)
{
	struct nl_msg *msg;
	struct nlmsghdr nlh = {
		.nlmsg_type = nlmsgtype,
		.nlmsg_flags = flags,
	};

	msg = nlmsg_inherit(&nlh);
	if (msg)
		NL_DBG(2, "msg %p: Allocated new simple message\n", msg);

	return msg;
}

/**
 * Set the default maximum message payload size for allocated messages
 * @arg max		Size of payload in bytes.
 */
void nlmsg_set_default_size(size_t max)
{
	if (max < nlmsg_total_size(0))
		max = nlmsg_total_size(0);

	default_msg_size = max;
}

/**
 * Convert a netlink message received from a netlink socket to a nl_msg
 * @arg hdr		Netlink message received from netlink socket.
 *
 * Allocates a new netlink message and copies all of the data pointed to
 * by \a hdr into the new message object.
 *
 * @return Newly allocated netlink message or NULL.
 */
struct nl_msg *nlmsg_convert(struct nlmsghdr *hdr)
{
	struct nl_msg *nm;

	nm = __nlmsg_alloc(NLMSG_ALIGN(hdr->nlmsg_len));
	if (!nm)
		goto errout;

	memcpy(nm->nm_nlh, hdr, hdr->nlmsg_len);

	return nm;
errout:
	nlmsg_free(nm);
	return NULL;
}

/**
 * Reserve room for additional data in a netlink message
 * @arg n		netlink message
 * @arg len		length of additional data to reserve room for
 * @arg pad		number of bytes to align data to
 *
 * Reserves room for additional data at the tail of the an
 * existing netlink message. Eventual padding required will
 * be zeroed out.
 *
 * @return Pointer to start of additional data tailroom or NULL.
 */
void *nlmsg_reserve(struct nl_msg *n, size_t len, int pad)
{
	void *buf = n->nm_nlh;
	size_t nlmsg_len = n->nm_nlh->nlmsg_len;
	size_t tlen;

	tlen = pad ? ((len + (pad - 1)) & ~(pad - 1)) : len;

	if ((tlen + nlmsg_len) > n->nm_size)
		return NULL;

	buf += nlmsg_len;
	n->nm_nlh->nlmsg_len += tlen;

	if (tlen > len)
		memset(buf + len, 0, tlen - len);

	NL_DBG(2, "msg %p: Reserved %zu bytes, pad=%d, nlmsg_len=%d\n",
		  n, len, pad, n->nm_nlh->nlmsg_len);

	return buf;
}

/**
 * Append data to tail of a netlink message
 * @arg n		netlink message
 * @arg data		data to add
 * @arg len		length of data
 * @arg pad		Number of bytes to align data to.
 *
 * Extends the netlink message as needed and appends the data of given
 * length to the message. 
 *
 * @return 0 on success or a negative error code
 */
int nlmsg_append(struct nl_msg *n, void *data, size_t len, int pad)
{
	void *tmp;

	tmp = nlmsg_reserve(n, len, pad);
	if (tmp == NULL)
		return -NLE_NOMEM;

	memcpy(tmp, data, len);
	NL_DBG(2, "msg %p: Appended %zu bytes with padding %d\n", n, len, pad);

	return 0;
}

/**
 * Add a netlink message header to a netlink message
 * @arg n		netlink message
 * @arg pid		netlink process id or NL_AUTO_PID
 * @arg seq		sequence number of message or NL_AUTO_SEQ
 * @arg type		message type
 * @arg payload		length of message payload
 * @arg flags		message flags
 *
 * Adds or overwrites the netlink message header in an existing message
 * object. If \a payload is greater-than zero additional room will be
 * reserved, f.e. for family specific headers. It can be accesed via
 * nlmsg_data().
 *
 * @return A pointer to the netlink message header or NULL.
 */
struct nlmsghdr *nlmsg_put(struct nl_msg *n, uint32_t pid, uint32_t seq,
			   int type, int payload, int flags)
{
	struct nlmsghdr *nlh;

	if (n->nm_nlh->nlmsg_len < NLMSG_HDRLEN)
		BUG();

	nlh = (struct nlmsghdr *) n->nm_nlh;
	nlh->nlmsg_type = type;
	nlh->nlmsg_flags = flags;
	nlh->nlmsg_pid = pid;
	nlh->nlmsg_seq = seq;

	NL_DBG(2, "msg %p: Added netlink header type=%d, flags=%d, pid=%d, "
		  "seq=%d\n", n, type, flags, pid, seq);

	if (payload > 0 &&
	    nlmsg_reserve(n, payload, NLMSG_ALIGNTO) == NULL)
		return NULL;

	return nlh;
}

/**
 * Release a reference from an netlink message
 * @arg msg		message to release reference from
 *
 * Frees memory after the last reference has been released.
 */
void nlmsg_free(struct nl_msg *msg)
{
	if (!msg)
		return;

	msg->nm_refcnt--;
	NL_DBG(4, "Returned message reference %p, %d remaining\n",
	       msg, msg->nm_refcnt);

	if (msg->nm_refcnt < 0)
		BUG();

	if (msg->nm_refcnt <= 0) {
		free(msg->nm_nlh);
		free(msg);
		NL_DBG(2, "msg %p: Freed\n", msg);
	}
}

/** @} */

/**
 * @name Direct Parsing
 * @{
 */

/** @cond SKIP */
struct dp_xdata {
	void (*cb)(struct nl_object *, void *);
	void *arg;
};
/** @endcond */

static int parse_cb(struct nl_object *obj, struct nl_parser_param *p)
{
	struct dp_xdata *x = p->pp_arg;

	x->cb(obj, x->arg);
	return 0;
}

int nl_msg_parse(struct nl_msg *msg, void (*cb)(struct nl_object *, void *),
		 void *arg)
{
	struct nl_cache_ops *ops;
	struct nl_parser_param p = {
		.pp_cb = parse_cb
	};
	struct dp_xdata x = {
		.cb = cb,
		.arg = arg,
	};

	ops = nl_cache_ops_associate(nlmsg_get_proto(msg),
				     nlmsg_hdr(msg)->nlmsg_type);
	if (ops == NULL)
		return -NLE_MSGTYPE_NOSUPPORT;
	p.pp_arg = &x;

	return nl_cache_parse(ops, NULL, nlmsg_hdr(msg), &p);
}

/** @} */