From dc7a12bdfccd94c31f79e294f16f7549bd411b49 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Sun, 14 Apr 2019 15:51:10 -0300 Subject: docs: arm: convert docs to ReST and rename to *.rst Converts ARM the text files to ReST, preparing them to be an architecture book. The conversion is actually: - add blank lines and identation in order to identify paragraphs; - fix tables markups; - add some lists markups; - mark literal blocks; - adjust title markups. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab Reviewed-by Corentin Labbe # For sun4i-ss --- arch/arm/Kconfig | 2 +- arch/arm/common/mcpm_entry.c | 2 +- arch/arm/common/mcpm_head.S | 2 +- arch/arm/common/vlock.S | 2 +- arch/arm/include/asm/setup.h | 2 +- arch/arm/include/uapi/asm/setup.h | 2 +- arch/arm/kernel/entry-armv.S | 2 +- arch/arm/mach-exynos/common.h | 2 +- arch/arm/mach-ixp4xx/Kconfig | 14 +++++++------- arch/arm/mach-s3c24xx/pm.c | 2 +- arch/arm/mm/Kconfig | 4 ++-- arch/arm/plat-samsung/Kconfig | 6 +++--- arch/arm/tools/mach-types | 2 +- arch/arm64/Kconfig | 2 +- arch/arm64/kernel/kuser32.S | 2 +- arch/mips/bmips/setup.c | 2 +- 16 files changed, 25 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2bf1ce39a96d..6425871e9903 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2142,7 +2142,7 @@ config VFP Say Y to include VFP support code in the kernel. This is needed if your hardware includes a VFP unit. - Please see for + Please see for release notes and additional status information. Say N if your target does not have VFP hardware. diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index e24ad60891b2..8a9aeeb504dd 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -21,7 +21,7 @@ /* * The public API for this code is documented in arch/arm/include/asm/mcpm.h. * For a comprehensive description of the main algorithm used here, please - * see Documentation/arm/cluster-pm-race-avoidance.txt. + * see Documentation/arm/cluster-pm-race-avoidance.rst. */ struct sync_struct mcpm_sync; diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S index d5bd75dd576d..291d969bc719 100644 --- a/arch/arm/common/mcpm_head.S +++ b/arch/arm/common/mcpm_head.S @@ -5,7 +5,7 @@ * Created by: Nicolas Pitre, March 2012 * Copyright: (C) 2012-2013 Linaro Limited * - * Refer to Documentation/arm/cluster-pm-race-avoidance.txt + * Refer to Documentation/arm/cluster-pm-race-avoidance.rst * for details of the synchronisation algorithms used here. */ diff --git a/arch/arm/common/vlock.S b/arch/arm/common/vlock.S index 9675cc15d0c4..f1c7fd44f1b1 100644 --- a/arch/arm/common/vlock.S +++ b/arch/arm/common/vlock.S @@ -6,7 +6,7 @@ * Copyright: (C) 2012-2013 Linaro Limited * * This algorithm is described in more detail in - * Documentation/arm/vlocks.txt. + * Documentation/arm/vlocks.rst. */ #include diff --git a/arch/arm/include/asm/setup.h b/arch/arm/include/asm/setup.h index 77e5582c2259..67d20712cb48 100644 --- a/arch/arm/include/asm/setup.h +++ b/arch/arm/include/asm/setup.h @@ -5,7 +5,7 @@ * Copyright (C) 1997-1999 Russell King * * Structure passed to kernel to tell it about the - * hardware it's running on. See Documentation/arm/Setup + * hardware it's running on. See Documentation/arm/setup.rst * for more info. */ #ifndef __ASMARM_SETUP_H diff --git a/arch/arm/include/uapi/asm/setup.h b/arch/arm/include/uapi/asm/setup.h index 6b335a9ff8c8..25ceda63b284 100644 --- a/arch/arm/include/uapi/asm/setup.h +++ b/arch/arm/include/uapi/asm/setup.h @@ -9,7 +9,7 @@ * published by the Free Software Foundation. * * Structure passed to kernel to tell it about the - * hardware it's running on. See Documentation/arm/Setup + * hardware it's running on. See Documentation/arm/setup.rst * for more info. */ #ifndef _UAPI__ASMARM_SETUP_H diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0b8cfdd60b90..858d4e541532 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -826,7 +826,7 @@ ENDPROC(__switch_to) * existing ones. This mechanism should be used only for things that are * really small and justified, and not be abused freely. * - * See Documentation/arm/kernel_user_helpers.txt for formal definitions. + * See Documentation/arm/kernel_user_helpers.rst for formal definitions. */ THUMB( .arm ) diff --git a/arch/arm/mach-exynos/common.h b/arch/arm/mach-exynos/common.h index c93356a8d662..56411bb63d45 100644 --- a/arch/arm/mach-exynos/common.h +++ b/arch/arm/mach-exynos/common.h @@ -106,7 +106,7 @@ void exynos_firmware_init(void); #define C2_STATE (1 << 3) /* * Magic values for bootloader indicating chosen low power mode. - * See also Documentation/arm/Samsung/Bootloader-interface.txt + * See also Documentation/arm/samsung/bootloader-interface.rst */ #define EXYNOS_SLEEP_MAGIC 0x00000bad #define EXYNOS_AFTR_MAGIC 0xfcba0d10 diff --git a/arch/arm/mach-ixp4xx/Kconfig b/arch/arm/mach-ixp4xx/Kconfig index fc5378b00f3d..f7211b57b1e7 100644 --- a/arch/arm/mach-ixp4xx/Kconfig +++ b/arch/arm/mach-ixp4xx/Kconfig @@ -33,7 +33,7 @@ config MACH_AVILA help Say 'Y' here if you want your kernel to support the Gateworks Avila Network Platform. For more information on this platform, - see . + see . config MACH_LOFT bool "Loft" @@ -49,7 +49,7 @@ config ARCH_ADI_COYOTE help Say 'Y' here if you want your kernel to support the ADI Engineering Coyote Gateway Reference Platform. For more - information on this platform, see . + information on this platform, see . config MACH_GATEWAY7001 bool "Gateway 7001" @@ -72,21 +72,21 @@ config ARCH_IXDP425 help Say 'Y' here if you want your kernel to support Intel's IXDP425 Development Platform (Also known as Richfield). - For more information on this platform, see . + For more information on this platform, see . config MACH_IXDPG425 bool "IXDPG425" help Say 'Y' here if you want your kernel to support Intel's IXDPG425 Development Platform (Also known as Montajade). - For more information on this platform, see . + For more information on this platform, see . config MACH_IXDP465 bool "IXDP465" help Say 'Y' here if you want your kernel to support Intel's IXDP465 Development Platform (Also known as BMP). - For more information on this platform, see . + For more information on this platform, see . config MACH_GORAMO_MLR bool "GORAMO Multi Link Router" @@ -99,7 +99,7 @@ config MACH_KIXRP435 help Say 'Y' here if you want your kernel to support Intel's KIXRP435 Reference Platform. - For more information on this platform, see . + For more information on this platform, see . # # IXCDP1100 is the exact same HW as IXDP425, but with a different machine @@ -116,7 +116,7 @@ config ARCH_PRPMC1100 help Say 'Y' here if you want your kernel to support the Motorola PrPCM1100 Processor Mezanine Module. For more information on - this platform, see . + this platform, see . config MACH_NAS100D bool diff --git a/arch/arm/mach-s3c24xx/pm.c b/arch/arm/mach-s3c24xx/pm.c index adcb90645460..c64988c609ad 100644 --- a/arch/arm/mach-s3c24xx/pm.c +++ b/arch/arm/mach-s3c24xx/pm.c @@ -5,7 +5,7 @@ // // S3C24XX Power Manager (Suspend-To-RAM) support // -// See Documentation/arm/Samsung-S3C24XX/Suspend.txt for more information +// See Documentation/arm/samsung-s3c24xx/suspend.rst for more information // // Parts based on arch/arm/mach-pxa/pm.c // diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index cc798115aa9b..820b60a50125 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -709,7 +709,7 @@ config ARM_VIRT_EXT assistance. A compliant bootloader is required in order to make maximum - use of this feature. Refer to Documentation/arm/Booting for + use of this feature. Refer to Documentation/arm/booting.rst for details. config SWP_EMULATE @@ -875,7 +875,7 @@ config KUSER_HELPERS the CPU type fitted to the system. This permits binaries to be run on ARMv4 through to ARMv7 without modification. - See Documentation/arm/kernel_user_helpers.txt for details. + See Documentation/arm/kernel_user_helpers.rst for details. However, the fixed address nature of these helpers can be used by ROP (return orientated programming) authors when creating diff --git a/arch/arm/plat-samsung/Kconfig b/arch/arm/plat-samsung/Kconfig index 53da57fba39c..301e572651c0 100644 --- a/arch/arm/plat-samsung/Kconfig +++ b/arch/arm/plat-samsung/Kconfig @@ -243,7 +243,7 @@ config SAMSUNG_PM_DEBUG depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART help Say Y here if you want verbose debugging from the PM Suspend and - Resume code. See + Resume code. See for more information. config S3C_PM_DEBUG_LED_SMDK @@ -268,7 +268,7 @@ config SAMSUNG_PM_CHECK Note, this can take several seconds depending on memory size and CPU speed. - See + See config SAMSUNG_PM_CHECK_CHUNKSIZE int "S3C2410 PM Suspend CRC Chunksize (KiB)" @@ -280,7 +280,7 @@ config SAMSUNG_PM_CHECK_CHUNKSIZE the CRC data block will take more memory, but will identify any faults with better precision. - See + See config SAMSUNG_WAKEMASK bool diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types index 4eac94c1eb6f..9e74c7ff6b04 100644 --- a/arch/arm/tools/mach-types +++ b/arch/arm/tools/mach-types @@ -7,7 +7,7 @@ # http://www.arm.linux.org.uk/developer/machines/download.php # # Please do not send patches to this file; it is automatically generated! -# To add an entry into this database, please see Documentation/arm/README, +# To add an entry into this database, please see Documentation/arm/arm.rst, # or visit: # # http://www.arm.linux.org.uk/developer/machines/?action=new diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a36ff61321ce..a4b22bbf0590 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1142,7 +1142,7 @@ config KUSER_HELPERS the system. This permits binaries to be run on ARMv4 through to ARMv8 without modification. - See Documentation/arm/kernel_user_helpers.txt for details. + See Documentation/arm/kernel_user_helpers.rst for details. However, the fixed address nature of these helpers can be used by ROP (return orientated programming) authors when creating diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 49825e9e421e..42bd8c0c60e0 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -10,7 +10,7 @@ * aarch32_setup_additional_pages() and are provided for compatibility * reasons with 32 bit (aarch32) applications that need them. * - * See Documentation/arm/kernel_user_helpers.txt for formal definitions. + * See Documentation/arm/kernel_user_helpers.rst for formal definitions. */ #include diff --git a/arch/mips/bmips/setup.c b/arch/mips/bmips/setup.c index 1738a06396f9..2f81a94c71a6 100644 --- a/arch/mips/bmips/setup.c +++ b/arch/mips/bmips/setup.c @@ -162,7 +162,7 @@ void __init plat_mem_setup(void) ioport_resource.start = 0; ioport_resource.end = ~0; - /* intended to somewhat resemble ARM; see Documentation/arm/Booting */ + /* intended to somewhat resemble ARM; see Documentation/arm/booting.rst */ if (fw_arg0 == 0 && fw_arg1 == 0xffffffff) dtb = phys_to_virt(fw_arg2); else if (fw_passed_dtb) /* UHI interface or appended dtb */ -- cgit v1.2.3-70-g09d2 From db9a0975a20c1f21c108b9d44545792d790593e4 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Apr 2019 10:10:33 -0300 Subject: docs: ia64: convert to ReST Rename the ia64 documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. There are two upper case file names. Rename them to lower case, as we're working to avoid upper case file names at Documentation. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab --- Documentation/ia64/IRQ-redir.txt | 69 --- Documentation/ia64/README | 43 -- Documentation/ia64/aliasing.rst | 246 +++++++++ Documentation/ia64/aliasing.txt | 221 -------- Documentation/ia64/efirtc.rst | 144 +++++ Documentation/ia64/efirtc.txt | 128 ----- Documentation/ia64/err_inject.rst | 1067 ++++++++++++++++++++++++++++++++++++ Documentation/ia64/err_inject.txt | 1068 ------------------------------------- Documentation/ia64/fsys.rst | 303 +++++++++++ Documentation/ia64/fsys.txt | 286 ---------- Documentation/ia64/ia64.rst | 49 ++ Documentation/ia64/index.rst | 18 + Documentation/ia64/irq-redir.rst | 80 +++ Documentation/ia64/mca.rst | 198 +++++++ Documentation/ia64/mca.txt | 194 ------- Documentation/ia64/serial.rst | 165 ++++++ Documentation/ia64/serial.txt | 151 ------ Documentation/ia64/xen.rst | 206 +++++++ Documentation/ia64/xen.txt | 183 ------- MAINTAINERS | 2 +- arch/ia64/kernel/efi.c | 2 +- arch/ia64/kernel/fsys.S | 2 +- arch/ia64/mm/ioremap.c | 2 +- arch/ia64/pci/pci.c | 2 +- 24 files changed, 2481 insertions(+), 2348 deletions(-) delete mode 100644 Documentation/ia64/IRQ-redir.txt delete mode 100644 Documentation/ia64/README create mode 100644 Documentation/ia64/aliasing.rst delete mode 100644 Documentation/ia64/aliasing.txt create mode 100644 Documentation/ia64/efirtc.rst delete mode 100644 Documentation/ia64/efirtc.txt create mode 100644 Documentation/ia64/err_inject.rst delete mode 100644 Documentation/ia64/err_inject.txt create mode 100644 Documentation/ia64/fsys.rst delete mode 100644 Documentation/ia64/fsys.txt create mode 100644 Documentation/ia64/ia64.rst create mode 100644 Documentation/ia64/index.rst create mode 100644 Documentation/ia64/irq-redir.rst create mode 100644 Documentation/ia64/mca.rst delete mode 100644 Documentation/ia64/mca.txt create mode 100644 Documentation/ia64/serial.rst delete mode 100644 Documentation/ia64/serial.txt create mode 100644 Documentation/ia64/xen.rst delete mode 100644 Documentation/ia64/xen.txt (limited to 'arch') diff --git a/Documentation/ia64/IRQ-redir.txt b/Documentation/ia64/IRQ-redir.txt deleted file mode 100644 index f7bd72261283..000000000000 --- a/Documentation/ia64/IRQ-redir.txt +++ /dev/null @@ -1,69 +0,0 @@ -IRQ affinity on IA64 platforms ------------------------------- - 07.01.2002, Erich Focht - - -By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be -controlled. The behavior on IA64 platforms is slightly different from -that described in Documentation/IRQ-affinity.txt for i386 systems. - -Because of the usage of SAPIC mode and physical destination mode the -IRQ target is one particular CPU and cannot be a mask of several -CPUs. Only the first non-zero bit is taken into account. - - -Usage examples: - -The target CPU has to be specified as a hexadecimal CPU mask. The -first non-zero bit is the selected CPU. This format has been kept for -compatibility reasons with i386. - -Set the delivery mode of interrupt 41 to fixed and route the -interrupts to CPU #3 (logical CPU number) (2^3=0x08): - echo "8" >/proc/irq/41/smp_affinity - -Set the default route for IRQ number 41 to CPU 6 in lowest priority -delivery mode (redirectable): - echo "r 40" >/proc/irq/41/smp_affinity - -The output of the command - cat /proc/irq/IRQ#/smp_affinity -gives the target CPU mask for the specified interrupt vector. If the CPU -mask is preceded by the character "r", the interrupt is redirectable -(i.e. lowest priority mode routing is used), otherwise its route is -fixed. - - - -Initialization and default behavior: - -If the platform features IRQ redirection (info provided by SAL) all -IO-SAPIC interrupts are initialized with CPU#0 as their default target -and the routing is the so called "lowest priority mode" (actually -fixed SAPIC mode with hint). The XTP chipset registers are used as hints -for the IRQ routing. Currently in Linux XTP registers can have three -values: - - minimal for an idle task, - - normal if any other task runs, - - maximal if the CPU is going to be switched off. -The IRQ is routed to the CPU with lowest XTP register value, the -search begins at the default CPU. Therefore most of the interrupts -will be handled by CPU #0. - -If the platform doesn't feature interrupt redirection IOSAPIC fixed -routing is used. The target CPUs are distributed in a round robin -manner. IRQs will be routed only to the selected target CPUs. Check -with - cat /proc/interrupts - - - -Comments: - -On large (multi-node) systems it is recommended to route the IRQs to -the node to which the corresponding device is connected. -For systems like the NEC AzusA we get IRQ node-affinity for free. This -is because usually the chipsets on each node redirect the interrupts -only to their own CPUs (as they cannot see the XTP registers on the -other nodes). - diff --git a/Documentation/ia64/README b/Documentation/ia64/README deleted file mode 100644 index aa17f2154cba..000000000000 --- a/Documentation/ia64/README +++ /dev/null @@ -1,43 +0,0 @@ - Linux kernel release 2.4.xx for the IA-64 Platform - - These are the release notes for Linux version 2.4 for IA-64 - platform. This document provides information specific to IA-64 - ONLY, to get additional information about the Linux kernel also - read the original Linux README provided with the kernel. - -INSTALLING the kernel: - - - IA-64 kernel installation is the same as the other platforms, see - original README for details. - - -SOFTWARE REQUIREMENTS - - Compiling and running this kernel requires an IA-64 compliant GCC - compiler. And various software packages also compiled with an - IA-64 compliant GCC compiler. - - -CONFIGURING the kernel: - - Configuration is the same, see original README for details. - - -COMPILING the kernel: - - - Compiling this kernel doesn't differ from other platform so read - the original README for details BUT make sure you have an IA-64 - compliant GCC compiler. - -IA-64 SPECIFICS - - - General issues: - - o Hardly any performance tuning has been done. Obvious targets - include the library routines (IP checksum, etc.). Less - obvious targets include making sure we don't flush the TLB - needlessly, etc. - - o SMP locks cleanup/optimization - - o IA32 support. Currently experimental. It mostly works. diff --git a/Documentation/ia64/aliasing.rst b/Documentation/ia64/aliasing.rst new file mode 100644 index 000000000000..a08b36aba015 --- /dev/null +++ b/Documentation/ia64/aliasing.rst @@ -0,0 +1,246 @@ +================================== +Memory Attribute Aliasing on IA-64 +================================== + +Bjorn Helgaas + +May 4, 2006 + + +Memory Attributes +================= + + Itanium supports several attributes for virtual memory references. + The attribute is part of the virtual translation, i.e., it is + contained in the TLB entry. The ones of most interest to the Linux + kernel are: + + == ====================== + WB Write-back (cacheable) + UC Uncacheable + WC Write-coalescing + == ====================== + + System memory typically uses the WB attribute. The UC attribute is + used for memory-mapped I/O devices. The WC attribute is uncacheable + like UC is, but writes may be delayed and combined to increase + performance for things like frame buffers. + + The Itanium architecture requires that we avoid accessing the same + page with both a cacheable mapping and an uncacheable mapping[1]. + + The design of the chipset determines which attributes are supported + on which regions of the address space. For example, some chipsets + support either WB or UC access to main memory, while others support + only WB access. + +Memory Map +========== + + Platform firmware describes the physical memory map and the + supported attributes for each region. At boot-time, the kernel uses + the EFI GetMemoryMap() interface. ACPI can also describe memory + devices and the attributes they support, but Linux/ia64 currently + doesn't use this information. + + The kernel uses the efi_memmap table returned from GetMemoryMap() to + learn the attributes supported by each region of physical address + space. Unfortunately, this table does not completely describe the + address space because some machines omit some or all of the MMIO + regions from the map. + + The kernel maintains another table, kern_memmap, which describes the + memory Linux is actually using and the attribute for each region. + This contains only system memory; it does not contain MMIO space. + + The kern_memmap table typically contains only a subset of the system + memory described by the efi_memmap. Linux/ia64 can't use all memory + in the system because of constraints imposed by the identity mapping + scheme. + + The efi_memmap table is preserved unmodified because the original + boot-time information is required for kexec. + +Kernel Identify Mappings +======================== + + Linux/ia64 identity mappings are done with large pages, currently + either 16MB or 64MB, referred to as "granules." Cacheable mappings + are speculative[2], so the processor can read any location in the + page at any time, independent of the programmer's intentions. This + means that to avoid attribute aliasing, Linux can create a cacheable + identity mapping only when the entire granule supports cacheable + access. + + Therefore, kern_memmap contains only full granule-sized regions that + can referenced safely by an identity mapping. + + Uncacheable mappings are not speculative, so the processor will + generate UC accesses only to locations explicitly referenced by + software. This allows UC identity mappings to cover granules that + are only partially populated, or populated with a combination of UC + and WB regions. + +User Mappings +============= + + User mappings are typically done with 16K or 64K pages. The smaller + page size allows more flexibility because only 16K or 64K has to be + homogeneous with respect to memory attributes. + +Potential Attribute Aliasing Cases +================================== + + There are several ways the kernel creates new mappings: + +mmap of /dev/mem +---------------- + + This uses remap_pfn_range(), which creates user mappings. These + mappings may be either WB or UC. If the region being mapped + happens to be in kern_memmap, meaning that it may also be mapped + by a kernel identity mapping, the user mapping must use the same + attribute as the kernel mapping. + + If the region is not in kern_memmap, the user mapping should use + an attribute reported as being supported in the EFI memory map. + + Since the EFI memory map does not describe MMIO on some + machines, this should use an uncacheable mapping as a fallback. + +mmap of /sys/class/pci_bus/.../legacy_mem +----------------------------------------- + + This is very similar to mmap of /dev/mem, except that legacy_mem + only allows mmap of the one megabyte "legacy MMIO" area for a + specific PCI bus. Typically this is the first megabyte of + physical address space, but it may be different on machines with + several VGA devices. + + "X" uses this to access VGA frame buffers. Using legacy_mem + rather than /dev/mem allows multiple instances of X to talk to + different VGA cards. + + The /dev/mem mmap constraints apply. + +mmap of /proc/bus/pci/.../??.? +------------------------------ + + This is an MMIO mmap of PCI functions, which additionally may or + may not be requested as using the WC attribute. + + If WC is requested, and the region in kern_memmap is either WC + or UC, and the EFI memory map designates the region as WC, then + the WC mapping is allowed. + + Otherwise, the user mapping must use the same attribute as the + kernel mapping. + +read/write of /dev/mem +---------------------- + + This uses copy_from_user(), which implicitly uses a kernel + identity mapping. This is obviously safe for things in + kern_memmap. + + There may be corner cases of things that are not in kern_memmap, + but could be accessed this way. For example, registers in MMIO + space are not in kern_memmap, but could be accessed with a UC + mapping. This would not cause attribute aliasing. But + registers typically can be accessed only with four-byte or + eight-byte accesses, and the copy_from_user() path doesn't allow + any control over the access size, so this would be dangerous. + +ioremap() +--------- + + This returns a mapping for use inside the kernel. + + If the region is in kern_memmap, we should use the attribute + specified there. + + If the EFI memory map reports that the entire granule supports + WB, we should use that (granules that are partially reserved + or occupied by firmware do not appear in kern_memmap). + + If the granule contains non-WB memory, but we can cover the + region safely with kernel page table mappings, we can use + ioremap_page_range() as most other architectures do. + + Failing all of the above, we have to fall back to a UC mapping. + +Past Problem Cases +================== + +mmap of various MMIO regions from /dev/mem by "X" on Intel platforms +-------------------------------------------------------------------- + + The EFI memory map may not report these MMIO regions. + + These must be allowed so that X will work. This means that + when the EFI memory map is incomplete, every /dev/mem mmap must + succeed. It may create either WB or UC user mappings, depending + on whether the region is in kern_memmap or the EFI memory map. + +mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled +---------------------------------------------------------------------- + + The EFI memory map reports the following attributes: + + =============== ======= ================== + 0x00000-0x9FFFF WB only + 0xA0000-0xBFFFF UC only (VGA frame buffer) + 0xC0000-0xFFFFF WB only + =============== ======= ================== + + This mmap is done with user pages, not kernel identity mappings, + so it is safe to use WB mappings. + + The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000, + which uses a granule-sized UC mapping. This granule will cover some + WB-only memory, but since UC is non-speculative, the processor will + never generate an uncacheable reference to the WB-only areas unless + the driver explicitly touches them. + +mmap of 0x0-0xFFFFF legacy_mem by "X" +------------------------------------- + + If the EFI memory map reports that the entire range supports the + same attributes, we can allow the mmap (and we will prefer WB if + supported, as is the case with HP sx[12]000 machines with VGA + disabled). + + If EFI reports the range as partly WB and partly UC (as on sx[12]000 + machines with VGA enabled), we must fail the mmap because there's no + safe attribute to use. + + If EFI reports some of the range but not all (as on Intel firmware + that doesn't report the VGA frame buffer at all), we should fail the + mmap and force the user to map just the specific region of interest. + +mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled +------------------------------------------------------------------------ + + The EFI memory map reports the following attributes:: + + 0x00000-0xFFFFF WB only (no VGA MMIO hole) + + This is a special case of the previous case, and the mmap should + fail for the same reason as above. + +read of /sys/devices/.../rom +---------------------------- + + For VGA devices, this may cause an ioremap() of 0xC0000. This + used to be done with a UC mapping, because the VGA frame buffer + at 0xA0000 prevents use of a WB granule. The UC mapping causes + an MCA on HP sx[12]000 chipsets. + + We should use WB page table mappings to avoid covering the VGA + frame buffer. + +Notes +===== + + [1] SDM rev 2.2, vol 2, sec 4.4.1. + [2] SDM rev 2.2, vol 2, sec 4.4.6. diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt deleted file mode 100644 index 5a4dea6abebd..000000000000 --- a/Documentation/ia64/aliasing.txt +++ /dev/null @@ -1,221 +0,0 @@ - MEMORY ATTRIBUTE ALIASING ON IA-64 - - Bjorn Helgaas - - May 4, 2006 - - -MEMORY ATTRIBUTES - - Itanium supports several attributes for virtual memory references. - The attribute is part of the virtual translation, i.e., it is - contained in the TLB entry. The ones of most interest to the Linux - kernel are: - - WB Write-back (cacheable) - UC Uncacheable - WC Write-coalescing - - System memory typically uses the WB attribute. The UC attribute is - used for memory-mapped I/O devices. The WC attribute is uncacheable - like UC is, but writes may be delayed and combined to increase - performance for things like frame buffers. - - The Itanium architecture requires that we avoid accessing the same - page with both a cacheable mapping and an uncacheable mapping[1]. - - The design of the chipset determines which attributes are supported - on which regions of the address space. For example, some chipsets - support either WB or UC access to main memory, while others support - only WB access. - -MEMORY MAP - - Platform firmware describes the physical memory map and the - supported attributes for each region. At boot-time, the kernel uses - the EFI GetMemoryMap() interface. ACPI can also describe memory - devices and the attributes they support, but Linux/ia64 currently - doesn't use this information. - - The kernel uses the efi_memmap table returned from GetMemoryMap() to - learn the attributes supported by each region of physical address - space. Unfortunately, this table does not completely describe the - address space because some machines omit some or all of the MMIO - regions from the map. - - The kernel maintains another table, kern_memmap, which describes the - memory Linux is actually using and the attribute for each region. - This contains only system memory; it does not contain MMIO space. - - The kern_memmap table typically contains only a subset of the system - memory described by the efi_memmap. Linux/ia64 can't use all memory - in the system because of constraints imposed by the identity mapping - scheme. - - The efi_memmap table is preserved unmodified because the original - boot-time information is required for kexec. - -KERNEL IDENTITY MAPPINGS - - Linux/ia64 identity mappings are done with large pages, currently - either 16MB or 64MB, referred to as "granules." Cacheable mappings - are speculative[2], so the processor can read any location in the - page at any time, independent of the programmer's intentions. This - means that to avoid attribute aliasing, Linux can create a cacheable - identity mapping only when the entire granule supports cacheable - access. - - Therefore, kern_memmap contains only full granule-sized regions that - can referenced safely by an identity mapping. - - Uncacheable mappings are not speculative, so the processor will - generate UC accesses only to locations explicitly referenced by - software. This allows UC identity mappings to cover granules that - are only partially populated, or populated with a combination of UC - and WB regions. - -USER MAPPINGS - - User mappings are typically done with 16K or 64K pages. The smaller - page size allows more flexibility because only 16K or 64K has to be - homogeneous with respect to memory attributes. - -POTENTIAL ATTRIBUTE ALIASING CASES - - There are several ways the kernel creates new mappings: - - mmap of /dev/mem - - This uses remap_pfn_range(), which creates user mappings. These - mappings may be either WB or UC. If the region being mapped - happens to be in kern_memmap, meaning that it may also be mapped - by a kernel identity mapping, the user mapping must use the same - attribute as the kernel mapping. - - If the region is not in kern_memmap, the user mapping should use - an attribute reported as being supported in the EFI memory map. - - Since the EFI memory map does not describe MMIO on some - machines, this should use an uncacheable mapping as a fallback. - - mmap of /sys/class/pci_bus/.../legacy_mem - - This is very similar to mmap of /dev/mem, except that legacy_mem - only allows mmap of the one megabyte "legacy MMIO" area for a - specific PCI bus. Typically this is the first megabyte of - physical address space, but it may be different on machines with - several VGA devices. - - "X" uses this to access VGA frame buffers. Using legacy_mem - rather than /dev/mem allows multiple instances of X to talk to - different VGA cards. - - The /dev/mem mmap constraints apply. - - mmap of /proc/bus/pci/.../??.? - - This is an MMIO mmap of PCI functions, which additionally may or - may not be requested as using the WC attribute. - - If WC is requested, and the region in kern_memmap is either WC - or UC, and the EFI memory map designates the region as WC, then - the WC mapping is allowed. - - Otherwise, the user mapping must use the same attribute as the - kernel mapping. - - read/write of /dev/mem - - This uses copy_from_user(), which implicitly uses a kernel - identity mapping. This is obviously safe for things in - kern_memmap. - - There may be corner cases of things that are not in kern_memmap, - but could be accessed this way. For example, registers in MMIO - space are not in kern_memmap, but could be accessed with a UC - mapping. This would not cause attribute aliasing. But - registers typically can be accessed only with four-byte or - eight-byte accesses, and the copy_from_user() path doesn't allow - any control over the access size, so this would be dangerous. - - ioremap() - - This returns a mapping for use inside the kernel. - - If the region is in kern_memmap, we should use the attribute - specified there. - - If the EFI memory map reports that the entire granule supports - WB, we should use that (granules that are partially reserved - or occupied by firmware do not appear in kern_memmap). - - If the granule contains non-WB memory, but we can cover the - region safely with kernel page table mappings, we can use - ioremap_page_range() as most other architectures do. - - Failing all of the above, we have to fall back to a UC mapping. - -PAST PROBLEM CASES - - mmap of various MMIO regions from /dev/mem by "X" on Intel platforms - - The EFI memory map may not report these MMIO regions. - - These must be allowed so that X will work. This means that - when the EFI memory map is incomplete, every /dev/mem mmap must - succeed. It may create either WB or UC user mappings, depending - on whether the region is in kern_memmap or the EFI memory map. - - mmap of 0x0-0x9FFFF /dev/mem by "hwinfo" on HP sx1000 with VGA enabled - - The EFI memory map reports the following attributes: - 0x00000-0x9FFFF WB only - 0xA0000-0xBFFFF UC only (VGA frame buffer) - 0xC0000-0xFFFFF WB only - - This mmap is done with user pages, not kernel identity mappings, - so it is safe to use WB mappings. - - The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000, - which uses a granule-sized UC mapping. This granule will cover some - WB-only memory, but since UC is non-speculative, the processor will - never generate an uncacheable reference to the WB-only areas unless - the driver explicitly touches them. - - mmap of 0x0-0xFFFFF legacy_mem by "X" - - If the EFI memory map reports that the entire range supports the - same attributes, we can allow the mmap (and we will prefer WB if - supported, as is the case with HP sx[12]000 machines with VGA - disabled). - - If EFI reports the range as partly WB and partly UC (as on sx[12]000 - machines with VGA enabled), we must fail the mmap because there's no - safe attribute to use. - - If EFI reports some of the range but not all (as on Intel firmware - that doesn't report the VGA frame buffer at all), we should fail the - mmap and force the user to map just the specific region of interest. - - mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled - - The EFI memory map reports the following attributes: - 0x00000-0xFFFFF WB only (no VGA MMIO hole) - - This is a special case of the previous case, and the mmap should - fail for the same reason as above. - - read of /sys/devices/.../rom - - For VGA devices, this may cause an ioremap() of 0xC0000. This - used to be done with a UC mapping, because the VGA frame buffer - at 0xA0000 prevents use of a WB granule. The UC mapping causes - an MCA on HP sx[12]000 chipsets. - - We should use WB page table mappings to avoid covering the VGA - frame buffer. - -NOTES - - [1] SDM rev 2.2, vol 2, sec 4.4.1. - [2] SDM rev 2.2, vol 2, sec 4.4.6. diff --git a/Documentation/ia64/efirtc.rst b/Documentation/ia64/efirtc.rst new file mode 100644 index 000000000000..2f7ff5026308 --- /dev/null +++ b/Documentation/ia64/efirtc.rst @@ -0,0 +1,144 @@ +========================== +EFI Real Time Clock driver +========================== + +S. Eranian + +March 2000 + +1. Introduction +=============== + +This document describes the efirtc.c driver has provided for +the IA-64 platform. + +The purpose of this driver is to supply an API for kernel and user applications +to get access to the Time Service offered by EFI version 0.92. + +EFI provides 4 calls one can make once the OS is booted: GetTime(), +SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this +driver. We describe those calls as well the design of the driver in the +following sections. + +2. Design Decisions +=================== + +The original ideas was to provide a very simple driver to get access to, +at first, the time of day service. This is required in order to access, in a +portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock +to initialize the system view of the time during boot. + +Because we wanted to minimize the impact on existing user-level apps using +the CMOS clock, we decided to expose an API that was very similar to the one +used today with the legacy RTC driver (driver/char/rtc.c). However, because +EFI provides a simpler services, not all ioctl() are available. Also +new ioctl()s have been introduced for things that EFI provides but not the +legacy. + +EFI uses a slightly different way of representing the time, noticeably +the reference date is different. Year is the using the full 4-digit format. +The Epoch is January 1st 1998. For backward compatibility reasons we don't +expose this new way of representing time. Instead we use something very +similar to the struct tm, i.e. struct rtc_time, as used by hwclock. +One of the reasons for doing it this way is to allow for EFI to still evolve +without necessarily impacting any of the user applications. The decoupling +enables flexibility and permits writing wrapper code is ncase things change. + +The driver exposes two interfaces, one via the device file and a set of +ioctl()s. The other is read-only via the /proc filesystem. + +As of today we don't offer a /proc/sys interface. + +To allow for a uniform interface between the legacy RTC and EFI time service, +we have created the include/linux/rtc.h header file to contain only the +"public" API of the two drivers. The specifics of the legacy RTC are still +in include/linux/mc146818rtc.h. + + +3. Time of day service +====================== + +The part of the driver gives access to the time of day service of EFI. +Two ioctl()s, compatible with the legacy RTC calls: + + Read the CMOS clock:: + + ioctl(d, RTC_RD_TIME, &rtc); + + Write the CMOS clock:: + + ioctl(d, RTC_SET_TIME, &rtc); + +The rtc is a pointer to a data structure defined in rtc.h which is close +to a struct tm:: + + struct rtc_time { + int tm_sec; + int tm_min; + int tm_hour; + int tm_mday; + int tm_mon; + int tm_year; + int tm_wday; + int tm_yday; + int tm_isdst; + }; + +The driver takes care of converting back an forth between the EFI time and +this format. + +Those two ioctl()s can be exercised with the hwclock command: + +For reading:: + + # /sbin/hwclock --show + Mon Mar 6 15:32:32 2000 -0.910248 seconds + +For setting:: + + # /sbin/hwclock --systohc + +Root privileges are required to be able to set the time of day. + +4. Wakeup Alarm service +======================= + +EFI provides an API by which one can program when a machine should wakeup, +i.e. reboot. This is very different from the alarm provided by the legacy +RTC which is some kind of interval timer alarm. For this reason we don't use +the same ioctl()s to get access to the service. Instead we have +introduced 2 news ioctl()s to the interface of an RTC. + +We have added 2 new ioctl()s that are specific to the EFI driver: + + Read the current state of the alarm:: + + ioctl(d, RTC_WKLAM_RD, &wkt) + + Set the alarm or change its status:: + + ioctl(d, RTC_WKALM_SET, &wkt) + +The wkt structure encapsulates a struct rtc_time + 2 extra fields to get +status information:: + + struct rtc_wkalrm { + + unsigned char enabled; /* =1 if alarm is enabled */ + unsigned char pending; /* =1 if alarm is pending */ + + struct rtc_time time; + } + +As of today, none of the existing user-level apps supports this feature. +However writing such a program should be hard by simply using those two +ioctl(). + +Root privileges are required to be able to set the alarm. + +5. References +============= + +Checkout the following Web site for more information on EFI: + +http://developer.intel.com/technology/efi/ diff --git a/Documentation/ia64/efirtc.txt b/Documentation/ia64/efirtc.txt deleted file mode 100644 index 057e6bebda8f..000000000000 --- a/Documentation/ia64/efirtc.txt +++ /dev/null @@ -1,128 +0,0 @@ -EFI Real Time Clock driver -------------------------------- -S. Eranian -March 2000 - -I/ Introduction - -This document describes the efirtc.c driver has provided for -the IA-64 platform. - -The purpose of this driver is to supply an API for kernel and user applications -to get access to the Time Service offered by EFI version 0.92. - -EFI provides 4 calls one can make once the OS is booted: GetTime(), -SetTime(), GetWakeupTime(), SetWakeupTime() which are all supported by this -driver. We describe those calls as well the design of the driver in the -following sections. - -II/ Design Decisions - -The original ideas was to provide a very simple driver to get access to, -at first, the time of day service. This is required in order to access, in a -portable way, the CMOS clock. A program like /sbin/hwclock uses such a clock -to initialize the system view of the time during boot. - -Because we wanted to minimize the impact on existing user-level apps using -the CMOS clock, we decided to expose an API that was very similar to the one -used today with the legacy RTC driver (driver/char/rtc.c). However, because -EFI provides a simpler services, not all ioctl() are available. Also -new ioctl()s have been introduced for things that EFI provides but not the -legacy. - -EFI uses a slightly different way of representing the time, noticeably -the reference date is different. Year is the using the full 4-digit format. -The Epoch is January 1st 1998. For backward compatibility reasons we don't -expose this new way of representing time. Instead we use something very -similar to the struct tm, i.e. struct rtc_time, as used by hwclock. -One of the reasons for doing it this way is to allow for EFI to still evolve -without necessarily impacting any of the user applications. The decoupling -enables flexibility and permits writing wrapper code is ncase things change. - -The driver exposes two interfaces, one via the device file and a set of -ioctl()s. The other is read-only via the /proc filesystem. - -As of today we don't offer a /proc/sys interface. - -To allow for a uniform interface between the legacy RTC and EFI time service, -we have created the include/linux/rtc.h header file to contain only the -"public" API of the two drivers. The specifics of the legacy RTC are still -in include/linux/mc146818rtc.h. - - -III/ Time of day service - -The part of the driver gives access to the time of day service of EFI. -Two ioctl()s, compatible with the legacy RTC calls: - - Read the CMOS clock: ioctl(d, RTC_RD_TIME, &rtc); - - Write the CMOS clock: ioctl(d, RTC_SET_TIME, &rtc); - -The rtc is a pointer to a data structure defined in rtc.h which is close -to a struct tm: - -struct rtc_time { - int tm_sec; - int tm_min; - int tm_hour; - int tm_mday; - int tm_mon; - int tm_year; - int tm_wday; - int tm_yday; - int tm_isdst; -}; - -The driver takes care of converting back an forth between the EFI time and -this format. - -Those two ioctl()s can be exercised with the hwclock command: - -For reading: -# /sbin/hwclock --show -Mon Mar 6 15:32:32 2000 -0.910248 seconds - -For setting: -# /sbin/hwclock --systohc - -Root privileges are required to be able to set the time of day. - -IV/ Wakeup Alarm service - -EFI provides an API by which one can program when a machine should wakeup, -i.e. reboot. This is very different from the alarm provided by the legacy -RTC which is some kind of interval timer alarm. For this reason we don't use -the same ioctl()s to get access to the service. Instead we have -introduced 2 news ioctl()s to the interface of an RTC. - -We have added 2 new ioctl()s that are specific to the EFI driver: - - Read the current state of the alarm - ioctl(d, RTC_WKLAM_RD, &wkt) - - Set the alarm or change its status - ioctl(d, RTC_WKALM_SET, &wkt) - -The wkt structure encapsulates a struct rtc_time + 2 extra fields to get -status information: - -struct rtc_wkalrm { - - unsigned char enabled; /* =1 if alarm is enabled */ - unsigned char pending; /* =1 if alarm is pending */ - - struct rtc_time time; -} - -As of today, none of the existing user-level apps supports this feature. -However writing such a program should be hard by simply using those two -ioctl(). - -Root privileges are required to be able to set the alarm. - -V/ References. - -Checkout the following Web site for more information on EFI: - -http://developer.intel.com/technology/efi/ diff --git a/Documentation/ia64/err_inject.rst b/Documentation/ia64/err_inject.rst new file mode 100644 index 000000000000..900f71e93a29 --- /dev/null +++ b/Documentation/ia64/err_inject.rst @@ -0,0 +1,1067 @@ +======================================== +IPF Machine Check (MC) error inject tool +======================================== + +IPF Machine Check (MC) error inject tool is used to inject MC +errors from Linux. The tool is a test bed for IPF MC work flow including +hardware correctable error handling, OS recoverable error handling, MC +event logging, etc. + +The tool includes two parts: a kernel driver and a user application +sample. The driver provides interface to PAL to inject error +and query error injection capabilities. The driver code is in +arch/ia64/kernel/err_inject.c. The application sample (shown below) +provides a combination of various errors and calls the driver's interface +(sysfs interface) to inject errors or query error injection capabilities. + +The tool can be used to test Intel IPF machine MC handling capabilities. +It's especially useful for people who can not access hardware MC injection +tool to inject error. It's also very useful to integrate with other +software test suits to do stressful testing on IPF. + +Below is a sample application as part of the whole tool. The sample +can be used as a working test tool. Or it can be expanded to include +more features. It also can be a integrated into a library or other user +application to have more thorough test. + +The sample application takes err.conf as error configuration input. GCC +compiles the code. After you install err_inject driver, you can run +this sample application to inject errors. + +Errata: Itanium 2 Processors Specification Update lists some errata against +the pal_mc_error_inject PAL procedure. The following err.conf has been tested +on latest Montecito PAL. + +err.conf:: + + #This is configuration file for err_inject_tool. + #The format of the each line is: + #cpu, loop, interval, err_type_info, err_struct_info, err_data_buffer + #where + # cpu: logical cpu number the error will be inject in. + # loop: times the error will be injected. + # interval: In second. every so often one error is injected. + # err_type_info, err_struct_info: PAL parameters. + # + #Note: All values are hex w/o or w/ 0x prefix. + + + #On cpu2, inject only total 0x10 errors, interval 5 seconds + #corrected, data cache, hier-2, physical addr(assigned by tool code). + #working on Montecito latest PAL. + 2, 10, 5, 4101, 95 + + #On cpu4, inject and consume total 0x10 errors, interval 5 seconds + #corrected, data cache, hier-2, physical addr(assigned by tool code). + #working on Montecito latest PAL. + 4, 10, 5, 4109, 95 + + #On cpu15, inject and consume total 0x10 errors, interval 5 seconds + #recoverable, DTR0, hier-2. + #working on Montecito latest PAL. + 0xf, 0x10, 5, 4249, 15 + +The sample application source code: + +err_injection_tool.c:: + + /* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Copyright (C) 2006 Intel Co + * Fenghua Yu + * + */ + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #define MAX_FN_SIZE 256 + #define MAX_BUF_SIZE 256 + #define DATA_BUF_SIZE 256 + #define NR_CPUS 512 + #define MAX_TASK_NUM 2048 + #define MIN_INTERVAL 5 // seconds + #define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte. + #define PARA_FIELD_NUM 5 + #define MASK_SIZE (NR_CPUS/64) + #define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/" + + int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask); + + int verbose; + #define vbprintf if (verbose) printf + + int log_info(int cpu, const char *fmt, ...) + { + FILE *log; + char fn[MAX_FN_SIZE]; + char buf[MAX_BUF_SIZE]; + va_list args; + + sprintf(fn, "%d.log", cpu); + log=fopen(fn, "a+"); + if (log==NULL) { + perror("Error open:"); + return -1; + } + + va_start(args, fmt); + vprintf(fmt, args); + memset(buf, 0, MAX_BUF_SIZE); + vsprintf(buf, fmt, args); + va_end(args); + + fwrite(buf, sizeof(buf), 1, log); + fclose(log); + + return 0; + } + + typedef unsigned long u64; + typedef unsigned int u32; + + typedef union err_type_info_u { + struct { + u64 mode : 3, /* 0-2 */ + err_inj : 3, /* 3-5 */ + err_sev : 2, /* 6-7 */ + err_struct : 5, /* 8-12 */ + struct_hier : 3, /* 13-15 */ + reserved : 48; /* 16-63 */ + } err_type_info_u; + u64 err_type_info; + } err_type_info_t; + + typedef union err_struct_info_u { + struct { + u64 siv : 1, /* 0 */ + c_t : 2, /* 1-2 */ + cl_p : 3, /* 3-5 */ + cl_id : 3, /* 6-8 */ + cl_dp : 1, /* 9 */ + reserved1 : 22, /* 10-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_cache; + struct { + u64 siv : 1, /* 0 */ + tt : 2, /* 1-2 */ + tc_tr : 2, /* 3-4 */ + tr_slot : 8, /* 5-12 */ + reserved1 : 19, /* 13-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_tlb; + struct { + u64 siv : 1, /* 0 */ + regfile_id : 4, /* 1-4 */ + reg_num : 7, /* 5-11 */ + reserved1 : 20, /* 12-31 */ + tiv : 1, /* 32 */ + trigger : 4, /* 33-36 */ + trigger_pl : 3, /* 37-39 */ + reserved2 : 24; /* 40-63 */ + } err_struct_info_register; + struct { + u64 reserved; + } err_struct_info_bus_processor_interconnect; + u64 err_struct_info; + } err_struct_info_t; + + typedef union err_data_buffer_u { + struct { + u64 trigger_addr; /* 0-63 */ + u64 inj_addr; /* 64-127 */ + u64 way : 5, /* 128-132 */ + index : 20, /* 133-152 */ + : 39; /* 153-191 */ + } err_data_buffer_cache; + struct { + u64 trigger_addr; /* 0-63 */ + u64 inj_addr; /* 64-127 */ + u64 way : 5, /* 128-132 */ + index : 20, /* 133-152 */ + reserved : 39; /* 153-191 */ + } err_data_buffer_tlb; + struct { + u64 trigger_addr; /* 0-63 */ + } err_data_buffer_register; + struct { + u64 reserved; /* 0-63 */ + } err_data_buffer_bus_processor_interconnect; + u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; + } err_data_buffer_t; + + typedef union capabilities_u { + struct { + u64 i : 1, + d : 1, + rv : 1, + tag : 1, + data : 1, + mesi : 1, + dp : 1, + reserved1 : 3, + pa : 1, + va : 1, + wi : 1, + reserved2 : 20, + trigger : 1, + trigger_pl : 1, + reserved3 : 30; + } capabilities_cache; + struct { + u64 d : 1, + i : 1, + rv : 1, + tc : 1, + tr : 1, + reserved1 : 27, + trigger : 1, + trigger_pl : 1, + reserved2 : 30; + } capabilities_tlb; + struct { + u64 gr_b0 : 1, + gr_b1 : 1, + fr : 1, + br : 1, + pr : 1, + ar : 1, + cr : 1, + rr : 1, + pkr : 1, + dbr : 1, + ibr : 1, + pmc : 1, + pmd : 1, + reserved1 : 3, + regnum : 1, + reserved2 : 15, + trigger : 1, + trigger_pl : 1, + reserved3 : 30; + } capabilities_register; + struct { + u64 reserved; + } capabilities_bus_processor_interconnect; + } capabilities_t; + + typedef struct resources_s { + u64 ibr0 : 1, + ibr2 : 1, + ibr4 : 1, + ibr6 : 1, + dbr0 : 1, + dbr2 : 1, + dbr4 : 1, + dbr6 : 1, + reserved : 48; + } resources_t; + + + long get_page_size(void) + { + long page_size=sysconf(_SC_PAGESIZE); + return page_size; + } + + #define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size()) + #define SHM_SIZE (2*PAGE_SIZE*NR_CPUS) + #define SHM_VA 0x2000000100000000 + + int shmid; + void *shmaddr; + + int create_shm(void) + { + key_t key; + char fn[MAX_FN_SIZE]; + + /* cpu0 is always existing */ + sprintf(fn, PATH_FORMAT, 0); + if ((key = ftok(fn, 's')) == -1) { + perror("ftok"); + return -1; + } + + shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT); + if (shmid == -1) { + if (errno==EEXIST) { + shmid = shmget(key, SHM_SIZE, 0); + if (shmid == -1) { + perror("shmget"); + return -1; + } + } + else { + perror("shmget"); + return -1; + } + } + vbprintf("shmid=%d", shmid); + + /* connect to the segment: */ + shmaddr = shmat(shmid, (void *)SHM_VA, 0); + if (shmaddr == (void*)-1) { + perror("shmat"); + return -1; + } + + memset(shmaddr, 0, SHM_SIZE); + mlock(shmaddr, SHM_SIZE); + + return 0; + } + + int free_shm() + { + munlock(shmaddr, SHM_SIZE); + shmdt(shmaddr); + semctl(shmid, 0, IPC_RMID); + + return 0; + } + + #ifdef _SEM_SEMUN_UNDEFINED + union semun + { + int val; + struct semid_ds *buf; + unsigned short int *array; + struct seminfo *__buf; + }; + #endif + + u32 mode=1; /* 1: physical mode; 2: virtual mode. */ + int one_lock=1; + key_t key[NR_CPUS]; + int semid[NR_CPUS]; + + int create_sem(int cpu) + { + union semun arg; + char fn[MAX_FN_SIZE]; + int sid; + + sprintf(fn, PATH_FORMAT, cpu); + sprintf(fn, "%s/%s", fn, "err_type_info"); + if ((key[cpu] = ftok(fn, 'e')) == -1) { + perror("ftok"); + return -1; + } + + if (semid[cpu]!=0) + return 0; + + /* clear old semaphore */ + if ((sid = semget(key[cpu], 1, 0)) != -1) + semctl(sid, 0, IPC_RMID); + + /* get one semaphore */ + if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) { + perror("semget"); + printf("Please remove semaphore with key=0x%lx, then run the tool.\n", + (u64)key[cpu]); + return -1; + } + + vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu, + (u64)key[cpu]); + /* initialize the semaphore to 1: */ + arg.val = 1; + if (semctl(semid[cpu], 0, SETVAL, arg) == -1) { + perror("semctl"); + return -1; + } + + return 0; + } + + static int lock(int cpu) + { + struct sembuf lock; + + lock.sem_num = cpu; + lock.sem_op = 1; + semop(semid[cpu], &lock, 1); + + return 0; + } + + static int unlock(int cpu) + { + struct sembuf unlock; + + unlock.sem_num = cpu; + unlock.sem_op = -1; + semop(semid[cpu], &unlock, 1); + + return 0; + } + + void free_sem(int cpu) + { + semctl(semid[cpu], 0, IPC_RMID); + } + + int wr_multi(char *fn, unsigned long *data, int size) + { + int fd; + char buf[MAX_BUF_SIZE]; + int ret; + + if (size==1) + sprintf(buf, "%lx", *data); + else if (size==3) + sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]); + else { + fprintf(stderr,"write to file with wrong size!\n"); + return -1; + } + + fd=open(fn, O_RDWR); + if (!fd) { + perror("Error:"); + return -1; + } + ret=write(fd, buf, sizeof(buf)); + close(fd); + return ret; + } + + int wr(char *fn, unsigned long data) + { + return wr_multi(fn, &data, 1); + } + + int rd(char *fn, unsigned long *data) + { + int fd; + char buf[MAX_BUF_SIZE]; + + fd=open(fn, O_RDONLY); + if (fd<0) { + perror("Error:"); + return -1; + } + read(fd, buf, MAX_BUF_SIZE); + *data=strtoul(buf, NULL, 16); + close(fd); + return 0; + } + + int rd_status(char *path, int *status) + { + char fn[MAX_FN_SIZE]; + sprintf(fn, "%s/status", path); + if (rd(fn, (u64*)status)<0) { + perror("status reading error.\n"); + return -1; + } + + return 0; + } + + int rd_capabilities(char *path, u64 *capabilities) + { + char fn[MAX_FN_SIZE]; + sprintf(fn, "%s/capabilities", path); + if (rd(fn, capabilities)<0) { + perror("capabilities reading error.\n"); + return -1; + } + + return 0; + } + + int rd_all(char *path) + { + unsigned long err_type_info, err_struct_info, err_data_buffer; + int status; + unsigned long capabilities, resources; + char fn[MAX_FN_SIZE]; + + sprintf(fn, "%s/err_type_info", path); + if (rd(fn, &err_type_info)<0) { + perror("err_type_info reading error.\n"); + return -1; + } + printf("err_type_info=%lx\n", err_type_info); + + sprintf(fn, "%s/err_struct_info", path); + if (rd(fn, &err_struct_info)<0) { + perror("err_struct_info reading error.\n"); + return -1; + } + printf("err_struct_info=%lx\n", err_struct_info); + + sprintf(fn, "%s/err_data_buffer", path); + if (rd(fn, &err_data_buffer)<0) { + perror("err_data_buffer reading error.\n"); + return -1; + } + printf("err_data_buffer=%lx\n", err_data_buffer); + + sprintf(fn, "%s/status", path); + if (rd("status", (u64*)&status)<0) { + perror("status reading error.\n"); + return -1; + } + printf("status=%d\n", status); + + sprintf(fn, "%s/capabilities", path); + if (rd(fn,&capabilities)<0) { + perror("capabilities reading error.\n"); + return -1; + } + printf("capabilities=%lx\n", capabilities); + + sprintf(fn, "%s/resources", path); + if (rd(fn, &resources)<0) { + perror("resources reading error.\n"); + return -1; + } + printf("resources=%lx\n", resources); + + return 0; + } + + int query_capabilities(char *path, err_type_info_t err_type_info, + u64 *capabilities) + { + char fn[MAX_FN_SIZE]; + err_struct_info_t err_struct_info; + err_data_buffer_t err_data_buffer; + + err_struct_info.err_struct_info=0; + memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8); + + sprintf(fn, "%s/err_type_info", path); + wr(fn, err_type_info.err_type_info); + sprintf(fn, "%s/err_struct_info", path); + wr(fn, 0x0); + sprintf(fn, "%s/err_data_buffer", path); + wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); + + // Fire pal_mc_error_inject procedure. + sprintf(fn, "%s/call_start", path); + wr(fn, mode); + + if (rd_capabilities(path, capabilities)<0) + return -1; + + return 0; + } + + int query_all_capabilities() + { + int status; + err_type_info_t err_type_info; + int err_sev, err_struct, struct_hier; + int cap=0; + u64 capabilities; + char path[MAX_FN_SIZE]; + + err_type_info.err_type_info=0; // Initial + err_type_info.err_type_info_u.mode=0; // Query mode; + err_type_info.err_type_info_u.err_inj=0; + + printf("All capabilities implemented in pal_mc_error_inject:\n"); + sprintf(path, PATH_FORMAT ,0); + for (err_sev=0;err_sev<3;err_sev++) + for (err_struct=0;err_struct<5;err_struct++) + for (struct_hier=0;struct_hier<5;struct_hier++) + { + status=-1; + capabilities=0; + err_type_info.err_type_info_u.err_sev=err_sev; + err_type_info.err_type_info_u.err_struct=err_struct; + err_type_info.err_type_info_u.struct_hier=struct_hier; + + if (query_capabilities(path, err_type_info, &capabilities)<0) + continue; + + if (rd_status(path, &status)<0) + continue; + + if (status==0) { + cap=1; + printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ", + err_sev, err_struct, struct_hier); + printf("capabilities 0x%lx\n", capabilities); + } + } + if (!cap) { + printf("No capabilities supported.\n"); + return 0; + } + + return 0; + } + + int err_inject(int cpu, char *path, err_type_info_t err_type_info, + err_struct_info_t err_struct_info, + err_data_buffer_t err_data_buffer) + { + int status; + char fn[MAX_FN_SIZE]; + + log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ", + err_type_info.err_type_info, + err_struct_info.err_struct_info); + log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n", + err_data_buffer.err_data_buffer[0], + err_data_buffer.err_data_buffer[1], + err_data_buffer.err_data_buffer[2]); + sprintf(fn, "%s/err_type_info", path); + wr(fn, err_type_info.err_type_info); + sprintf(fn, "%s/err_struct_info", path); + wr(fn, err_struct_info.err_struct_info); + sprintf(fn, "%s/err_data_buffer", path); + wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); + + // Fire pal_mc_error_inject procedure. + sprintf(fn, "%s/call_start", path); + wr(fn,mode); + + if (rd_status(path, &status)<0) { + vbprintf("fail: read status\n"); + return -100; + } + + if (status!=0) { + log_info(cpu, "fail: status=%d\n", status); + return status; + } + + return status; + } + + static int construct_data_buf(char *path, err_type_info_t err_type_info, + err_struct_info_t err_struct_info, + err_data_buffer_t *err_data_buffer, + void *va1) + { + char fn[MAX_FN_SIZE]; + u64 virt_addr=0, phys_addr=0; + + vbprintf("va1=%lx\n", (u64)va1); + memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8); + + switch (err_type_info.err_type_info_u.err_struct) { + case 1: // Cache + switch (err_struct_info.err_struct_info_cache.cl_id) { + case 1: //Virtual addr + err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1; + break; + case 2: //Phys addr + sprintf(fn, "%s/virtual_to_phys", path); + virt_addr=(u64)va1; + if (wr(fn,virt_addr)<0) + return -1; + rd(fn, &phys_addr); + err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr; + break; + default: + printf("Not supported cl_id\n"); + break; + } + break; + case 2: // TLB + break; + case 3: // Register file + break; + case 4: // Bus/system interconnect + default: + printf("Not supported err_struct\n"); + break; + } + + return 0; + } + + typedef struct { + u64 cpu; + u64 loop; + u64 interval; + u64 err_type_info; + u64 err_struct_info; + u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; + } parameters_t; + + parameters_t line_para; + int para; + + static int empty_data_buffer(u64 *err_data_buffer) + { + int empty=1; + int i; + + for (i=0;iMIN_INTERVAL + ?interval:MIN_INTERVAL; + parameters[num].err_type_info=err_type_info_conf; + parameters[num].err_struct_info=err_struct_info_conf; + memcpy(parameters[num++].err_data_buffer, + err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ; + + if (num>=MAX_TASK_NUM) + break; + } + } + else { + parameters[0].cpu=line_para.cpu; + parameters[0].loop=line_para.loop; + parameters[0].interval= line_para.interval>MIN_INTERVAL + ?line_para.interval:MIN_INTERVAL; + parameters[0].err_type_info=line_para.err_type_info; + parameters[0].err_struct_info=line_para.err_struct_info; + memcpy(parameters[0].err_data_buffer, + line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ; + + num=1; + } + + /* Create semaphore: If one_lock, one semaphore for all processors. + Otherwise, one semaphore for each processor. */ + if (one_lock) { + if (create_sem(0)) { + printf("Can not create semaphore...exit\n"); + free_sem(0); + return -1; + } + } + else { + for (i=0;i - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MAX_FN_SIZE 256 -#define MAX_BUF_SIZE 256 -#define DATA_BUF_SIZE 256 -#define NR_CPUS 512 -#define MAX_TASK_NUM 2048 -#define MIN_INTERVAL 5 // seconds -#define ERR_DATA_BUFFER_SIZE 3 // Three 8-byte. -#define PARA_FIELD_NUM 5 -#define MASK_SIZE (NR_CPUS/64) -#define PATH_FORMAT "/sys/devices/system/cpu/cpu%d/err_inject/" - -int sched_setaffinity(pid_t pid, unsigned int len, unsigned long *mask); - -int verbose; -#define vbprintf if (verbose) printf - -int log_info(int cpu, const char *fmt, ...) -{ - FILE *log; - char fn[MAX_FN_SIZE]; - char buf[MAX_BUF_SIZE]; - va_list args; - - sprintf(fn, "%d.log", cpu); - log=fopen(fn, "a+"); - if (log==NULL) { - perror("Error open:"); - return -1; - } - - va_start(args, fmt); - vprintf(fmt, args); - memset(buf, 0, MAX_BUF_SIZE); - vsprintf(buf, fmt, args); - va_end(args); - - fwrite(buf, sizeof(buf), 1, log); - fclose(log); - - return 0; -} - -typedef unsigned long u64; -typedef unsigned int u32; - -typedef union err_type_info_u { - struct { - u64 mode : 3, /* 0-2 */ - err_inj : 3, /* 3-5 */ - err_sev : 2, /* 6-7 */ - err_struct : 5, /* 8-12 */ - struct_hier : 3, /* 13-15 */ - reserved : 48; /* 16-63 */ - } err_type_info_u; - u64 err_type_info; -} err_type_info_t; - -typedef union err_struct_info_u { - struct { - u64 siv : 1, /* 0 */ - c_t : 2, /* 1-2 */ - cl_p : 3, /* 3-5 */ - cl_id : 3, /* 6-8 */ - cl_dp : 1, /* 9 */ - reserved1 : 22, /* 10-31 */ - tiv : 1, /* 32 */ - trigger : 4, /* 33-36 */ - trigger_pl : 3, /* 37-39 */ - reserved2 : 24; /* 40-63 */ - } err_struct_info_cache; - struct { - u64 siv : 1, /* 0 */ - tt : 2, /* 1-2 */ - tc_tr : 2, /* 3-4 */ - tr_slot : 8, /* 5-12 */ - reserved1 : 19, /* 13-31 */ - tiv : 1, /* 32 */ - trigger : 4, /* 33-36 */ - trigger_pl : 3, /* 37-39 */ - reserved2 : 24; /* 40-63 */ - } err_struct_info_tlb; - struct { - u64 siv : 1, /* 0 */ - regfile_id : 4, /* 1-4 */ - reg_num : 7, /* 5-11 */ - reserved1 : 20, /* 12-31 */ - tiv : 1, /* 32 */ - trigger : 4, /* 33-36 */ - trigger_pl : 3, /* 37-39 */ - reserved2 : 24; /* 40-63 */ - } err_struct_info_register; - struct { - u64 reserved; - } err_struct_info_bus_processor_interconnect; - u64 err_struct_info; -} err_struct_info_t; - -typedef union err_data_buffer_u { - struct { - u64 trigger_addr; /* 0-63 */ - u64 inj_addr; /* 64-127 */ - u64 way : 5, /* 128-132 */ - index : 20, /* 133-152 */ - : 39; /* 153-191 */ - } err_data_buffer_cache; - struct { - u64 trigger_addr; /* 0-63 */ - u64 inj_addr; /* 64-127 */ - u64 way : 5, /* 128-132 */ - index : 20, /* 133-152 */ - reserved : 39; /* 153-191 */ - } err_data_buffer_tlb; - struct { - u64 trigger_addr; /* 0-63 */ - } err_data_buffer_register; - struct { - u64 reserved; /* 0-63 */ - } err_data_buffer_bus_processor_interconnect; - u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; -} err_data_buffer_t; - -typedef union capabilities_u { - struct { - u64 i : 1, - d : 1, - rv : 1, - tag : 1, - data : 1, - mesi : 1, - dp : 1, - reserved1 : 3, - pa : 1, - va : 1, - wi : 1, - reserved2 : 20, - trigger : 1, - trigger_pl : 1, - reserved3 : 30; - } capabilities_cache; - struct { - u64 d : 1, - i : 1, - rv : 1, - tc : 1, - tr : 1, - reserved1 : 27, - trigger : 1, - trigger_pl : 1, - reserved2 : 30; - } capabilities_tlb; - struct { - u64 gr_b0 : 1, - gr_b1 : 1, - fr : 1, - br : 1, - pr : 1, - ar : 1, - cr : 1, - rr : 1, - pkr : 1, - dbr : 1, - ibr : 1, - pmc : 1, - pmd : 1, - reserved1 : 3, - regnum : 1, - reserved2 : 15, - trigger : 1, - trigger_pl : 1, - reserved3 : 30; - } capabilities_register; - struct { - u64 reserved; - } capabilities_bus_processor_interconnect; -} capabilities_t; - -typedef struct resources_s { - u64 ibr0 : 1, - ibr2 : 1, - ibr4 : 1, - ibr6 : 1, - dbr0 : 1, - dbr2 : 1, - dbr4 : 1, - dbr6 : 1, - reserved : 48; -} resources_t; - - -long get_page_size(void) -{ - long page_size=sysconf(_SC_PAGESIZE); - return page_size; -} - -#define PAGE_SIZE (get_page_size()==-1?0x4000:get_page_size()) -#define SHM_SIZE (2*PAGE_SIZE*NR_CPUS) -#define SHM_VA 0x2000000100000000 - -int shmid; -void *shmaddr; - -int create_shm(void) -{ - key_t key; - char fn[MAX_FN_SIZE]; - - /* cpu0 is always existing */ - sprintf(fn, PATH_FORMAT, 0); - if ((key = ftok(fn, 's')) == -1) { - perror("ftok"); - return -1; - } - - shmid = shmget(key, SHM_SIZE, 0644 | IPC_CREAT); - if (shmid == -1) { - if (errno==EEXIST) { - shmid = shmget(key, SHM_SIZE, 0); - if (shmid == -1) { - perror("shmget"); - return -1; - } - } - else { - perror("shmget"); - return -1; - } - } - vbprintf("shmid=%d", shmid); - - /* connect to the segment: */ - shmaddr = shmat(shmid, (void *)SHM_VA, 0); - if (shmaddr == (void*)-1) { - perror("shmat"); - return -1; - } - - memset(shmaddr, 0, SHM_SIZE); - mlock(shmaddr, SHM_SIZE); - - return 0; -} - -int free_shm() -{ - munlock(shmaddr, SHM_SIZE); - shmdt(shmaddr); - semctl(shmid, 0, IPC_RMID); - - return 0; -} - -#ifdef _SEM_SEMUN_UNDEFINED -union semun -{ - int val; - struct semid_ds *buf; - unsigned short int *array; - struct seminfo *__buf; -}; -#endif - -u32 mode=1; /* 1: physical mode; 2: virtual mode. */ -int one_lock=1; -key_t key[NR_CPUS]; -int semid[NR_CPUS]; - -int create_sem(int cpu) -{ - union semun arg; - char fn[MAX_FN_SIZE]; - int sid; - - sprintf(fn, PATH_FORMAT, cpu); - sprintf(fn, "%s/%s", fn, "err_type_info"); - if ((key[cpu] = ftok(fn, 'e')) == -1) { - perror("ftok"); - return -1; - } - - if (semid[cpu]!=0) - return 0; - - /* clear old semaphore */ - if ((sid = semget(key[cpu], 1, 0)) != -1) - semctl(sid, 0, IPC_RMID); - - /* get one semaphore */ - if ((semid[cpu] = semget(key[cpu], 1, IPC_CREAT | IPC_EXCL)) == -1) { - perror("semget"); - printf("Please remove semaphore with key=0x%lx, then run the tool.\n", - (u64)key[cpu]); - return -1; - } - - vbprintf("semid[%d]=0x%lx, key[%d]=%lx\n",cpu,(u64)semid[cpu],cpu, - (u64)key[cpu]); - /* initialize the semaphore to 1: */ - arg.val = 1; - if (semctl(semid[cpu], 0, SETVAL, arg) == -1) { - perror("semctl"); - return -1; - } - - return 0; -} - -static int lock(int cpu) -{ - struct sembuf lock; - - lock.sem_num = cpu; - lock.sem_op = 1; - semop(semid[cpu], &lock, 1); - - return 0; -} - -static int unlock(int cpu) -{ - struct sembuf unlock; - - unlock.sem_num = cpu; - unlock.sem_op = -1; - semop(semid[cpu], &unlock, 1); - - return 0; -} - -void free_sem(int cpu) -{ - semctl(semid[cpu], 0, IPC_RMID); -} - -int wr_multi(char *fn, unsigned long *data, int size) -{ - int fd; - char buf[MAX_BUF_SIZE]; - int ret; - - if (size==1) - sprintf(buf, "%lx", *data); - else if (size==3) - sprintf(buf, "%lx,%lx,%lx", data[0], data[1], data[2]); - else { - fprintf(stderr,"write to file with wrong size!\n"); - return -1; - } - - fd=open(fn, O_RDWR); - if (!fd) { - perror("Error:"); - return -1; - } - ret=write(fd, buf, sizeof(buf)); - close(fd); - return ret; -} - -int wr(char *fn, unsigned long data) -{ - return wr_multi(fn, &data, 1); -} - -int rd(char *fn, unsigned long *data) -{ - int fd; - char buf[MAX_BUF_SIZE]; - - fd=open(fn, O_RDONLY); - if (fd<0) { - perror("Error:"); - return -1; - } - read(fd, buf, MAX_BUF_SIZE); - *data=strtoul(buf, NULL, 16); - close(fd); - return 0; -} - -int rd_status(char *path, int *status) -{ - char fn[MAX_FN_SIZE]; - sprintf(fn, "%s/status", path); - if (rd(fn, (u64*)status)<0) { - perror("status reading error.\n"); - return -1; - } - - return 0; -} - -int rd_capabilities(char *path, u64 *capabilities) -{ - char fn[MAX_FN_SIZE]; - sprintf(fn, "%s/capabilities", path); - if (rd(fn, capabilities)<0) { - perror("capabilities reading error.\n"); - return -1; - } - - return 0; -} - -int rd_all(char *path) -{ - unsigned long err_type_info, err_struct_info, err_data_buffer; - int status; - unsigned long capabilities, resources; - char fn[MAX_FN_SIZE]; - - sprintf(fn, "%s/err_type_info", path); - if (rd(fn, &err_type_info)<0) { - perror("err_type_info reading error.\n"); - return -1; - } - printf("err_type_info=%lx\n", err_type_info); - - sprintf(fn, "%s/err_struct_info", path); - if (rd(fn, &err_struct_info)<0) { - perror("err_struct_info reading error.\n"); - return -1; - } - printf("err_struct_info=%lx\n", err_struct_info); - - sprintf(fn, "%s/err_data_buffer", path); - if (rd(fn, &err_data_buffer)<0) { - perror("err_data_buffer reading error.\n"); - return -1; - } - printf("err_data_buffer=%lx\n", err_data_buffer); - - sprintf(fn, "%s/status", path); - if (rd("status", (u64*)&status)<0) { - perror("status reading error.\n"); - return -1; - } - printf("status=%d\n", status); - - sprintf(fn, "%s/capabilities", path); - if (rd(fn,&capabilities)<0) { - perror("capabilities reading error.\n"); - return -1; - } - printf("capabilities=%lx\n", capabilities); - - sprintf(fn, "%s/resources", path); - if (rd(fn, &resources)<0) { - perror("resources reading error.\n"); - return -1; - } - printf("resources=%lx\n", resources); - - return 0; -} - -int query_capabilities(char *path, err_type_info_t err_type_info, - u64 *capabilities) -{ - char fn[MAX_FN_SIZE]; - err_struct_info_t err_struct_info; - err_data_buffer_t err_data_buffer; - - err_struct_info.err_struct_info=0; - memset(err_data_buffer.err_data_buffer, -1, ERR_DATA_BUFFER_SIZE*8); - - sprintf(fn, "%s/err_type_info", path); - wr(fn, err_type_info.err_type_info); - sprintf(fn, "%s/err_struct_info", path); - wr(fn, 0x0); - sprintf(fn, "%s/err_data_buffer", path); - wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); - - // Fire pal_mc_error_inject procedure. - sprintf(fn, "%s/call_start", path); - wr(fn, mode); - - if (rd_capabilities(path, capabilities)<0) - return -1; - - return 0; -} - -int query_all_capabilities() -{ - int status; - err_type_info_t err_type_info; - int err_sev, err_struct, struct_hier; - int cap=0; - u64 capabilities; - char path[MAX_FN_SIZE]; - - err_type_info.err_type_info=0; // Initial - err_type_info.err_type_info_u.mode=0; // Query mode; - err_type_info.err_type_info_u.err_inj=0; - - printf("All capabilities implemented in pal_mc_error_inject:\n"); - sprintf(path, PATH_FORMAT ,0); - for (err_sev=0;err_sev<3;err_sev++) - for (err_struct=0;err_struct<5;err_struct++) - for (struct_hier=0;struct_hier<5;struct_hier++) - { - status=-1; - capabilities=0; - err_type_info.err_type_info_u.err_sev=err_sev; - err_type_info.err_type_info_u.err_struct=err_struct; - err_type_info.err_type_info_u.struct_hier=struct_hier; - - if (query_capabilities(path, err_type_info, &capabilities)<0) - continue; - - if (rd_status(path, &status)<0) - continue; - - if (status==0) { - cap=1; - printf("For err_sev=%d, err_struct=%d, struct_hier=%d: ", - err_sev, err_struct, struct_hier); - printf("capabilities 0x%lx\n", capabilities); - } - } - if (!cap) { - printf("No capabilities supported.\n"); - return 0; - } - - return 0; -} - -int err_inject(int cpu, char *path, err_type_info_t err_type_info, - err_struct_info_t err_struct_info, - err_data_buffer_t err_data_buffer) -{ - int status; - char fn[MAX_FN_SIZE]; - - log_info(cpu, "err_type_info=%lx, err_struct_info=%lx, ", - err_type_info.err_type_info, - err_struct_info.err_struct_info); - log_info(cpu,"err_data_buffer=[%lx,%lx,%lx]\n", - err_data_buffer.err_data_buffer[0], - err_data_buffer.err_data_buffer[1], - err_data_buffer.err_data_buffer[2]); - sprintf(fn, "%s/err_type_info", path); - wr(fn, err_type_info.err_type_info); - sprintf(fn, "%s/err_struct_info", path); - wr(fn, err_struct_info.err_struct_info); - sprintf(fn, "%s/err_data_buffer", path); - wr_multi(fn, err_data_buffer.err_data_buffer, ERR_DATA_BUFFER_SIZE); - - // Fire pal_mc_error_inject procedure. - sprintf(fn, "%s/call_start", path); - wr(fn,mode); - - if (rd_status(path, &status)<0) { - vbprintf("fail: read status\n"); - return -100; - } - - if (status!=0) { - log_info(cpu, "fail: status=%d\n", status); - return status; - } - - return status; -} - -static int construct_data_buf(char *path, err_type_info_t err_type_info, - err_struct_info_t err_struct_info, - err_data_buffer_t *err_data_buffer, - void *va1) -{ - char fn[MAX_FN_SIZE]; - u64 virt_addr=0, phys_addr=0; - - vbprintf("va1=%lx\n", (u64)va1); - memset(&err_data_buffer->err_data_buffer_cache, 0, ERR_DATA_BUFFER_SIZE*8); - - switch (err_type_info.err_type_info_u.err_struct) { - case 1: // Cache - switch (err_struct_info.err_struct_info_cache.cl_id) { - case 1: //Virtual addr - err_data_buffer->err_data_buffer_cache.inj_addr=(u64)va1; - break; - case 2: //Phys addr - sprintf(fn, "%s/virtual_to_phys", path); - virt_addr=(u64)va1; - if (wr(fn,virt_addr)<0) - return -1; - rd(fn, &phys_addr); - err_data_buffer->err_data_buffer_cache.inj_addr=phys_addr; - break; - default: - printf("Not supported cl_id\n"); - break; - } - break; - case 2: // TLB - break; - case 3: // Register file - break; - case 4: // Bus/system interconnect - default: - printf("Not supported err_struct\n"); - break; - } - - return 0; -} - -typedef struct { - u64 cpu; - u64 loop; - u64 interval; - u64 err_type_info; - u64 err_struct_info; - u64 err_data_buffer[ERR_DATA_BUFFER_SIZE]; -} parameters_t; - -parameters_t line_para; -int para; - -static int empty_data_buffer(u64 *err_data_buffer) -{ - int empty=1; - int i; - - for (i=0;iMIN_INTERVAL - ?interval:MIN_INTERVAL; - parameters[num].err_type_info=err_type_info_conf; - parameters[num].err_struct_info=err_struct_info_conf; - memcpy(parameters[num++].err_data_buffer, - err_data_buffer_conf,ERR_DATA_BUFFER_SIZE*8) ; - - if (num>=MAX_TASK_NUM) - break; - } - } - else { - parameters[0].cpu=line_para.cpu; - parameters[0].loop=line_para.loop; - parameters[0].interval= line_para.interval>MIN_INTERVAL - ?line_para.interval:MIN_INTERVAL; - parameters[0].err_type_info=line_para.err_type_info; - parameters[0].err_struct_info=line_para.err_struct_info; - memcpy(parameters[0].err_data_buffer, - line_para.err_data_buffer,ERR_DATA_BUFFER_SIZE*8) ; - - num=1; - } - - /* Create semaphore: If one_lock, one semaphore for all processors. - Otherwise, one semaphore for each processor. */ - if (one_lock) { - if (create_sem(0)) { - printf("Can not create semaphore...exit\n"); - free_sem(0); - return -1; - } - } - else { - for (i=0;i + +Using the "epc" instruction effectively introduces a new mode of +execution to the ia64 linux kernel. We call this mode the +"fsys-mode". To recap, the normal states of execution are: + + - kernel mode: + Both the register stack and the memory stack have been + switched over to kernel memory. The user-level state is saved + in a pt-regs structure at the top of the kernel memory stack. + + - user mode: + Both the register stack and the kernel stack are in + user memory. The user-level state is contained in the + CPU registers. + + - bank 0 interruption-handling mode: + This is the non-interruptible state which all + interruption-handlers start execution in. The user-level + state remains in the CPU registers and some kernel state may + be stored in bank 0 of registers r16-r31. + +In contrast, fsys-mode has the following special properties: + + - execution is at privilege level 0 (most-privileged) + + - CPU registers may contain a mixture of user-level and kernel-level + state (it is the responsibility of the kernel to ensure that no + security-sensitive kernel-level state is leaked back to + user-level) + + - execution is interruptible and preemptible (an fsys-mode handler + can disable interrupts and avoid all other interruption-sources + to avoid preemption) + + - neither the memory-stack nor the register-stack can be trusted while + in fsys-mode (they point to the user-level stacks, which may + be invalid, or completely bogus addresses) + +In summary, fsys-mode is much more similar to running in user-mode +than it is to running in kernel-mode. Of course, given that the +privilege level is at level 0, this means that fsys-mode requires some +care (see below). + + +How to tell fsys-mode +===================== + +Linux operates in fsys-mode when (a) the privilege level is 0 (most +privileged) and (b) the stacks have NOT been switched to kernel memory +yet. For convenience, the header file provides +three macros:: + + user_mode(regs) + user_stack(task,regs) + fsys_mode(task,regs) + +The "regs" argument is a pointer to a pt_regs structure. The "task" +argument is a pointer to the task structure to which the "regs" +pointer belongs to. user_mode() returns TRUE if the CPU state pointed +to by "regs" was executing in user mode (privilege level 3). +user_stack() returns TRUE if the state pointed to by "regs" was +executing on the user-level stack(s). Finally, fsys_mode() returns +TRUE if the CPU state pointed to by "regs" was executing in fsys-mode. +The fsys_mode() macro is equivalent to the expression:: + + !user_mode(regs) && user_stack(task,regs) + +How to write an fsyscall handler +================================ + +The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers +(fsyscall_table). This table contains one entry for each system call. +By default, a system call is handled by fsys_fallback_syscall(). This +routine takes care of entering (full) kernel mode and calling the +normal Linux system call handler. For performance-critical system +calls, it is possible to write a hand-tuned fsyscall_handler. For +example, fsys.S contains fsys_getpid(), which is a hand-tuned version +of the getpid() system call. + +The entry and exit-state of an fsyscall handler is as follows: + +Machine state on entry to fsyscall handler +------------------------------------------ + + ========= =============================================================== + r10 0 + r11 saved ar.pfs (a user-level value) + r15 system call number + r16 "current" task pointer (in normal kernel-mode, this is in r13) + r32-r39 system call arguments + b6 return address (a user-level value) + ar.pfs previous frame-state (a user-level value) + PSR.be cleared to zero (i.e., little-endian byte order is in effect) + - all other registers may contain values passed in from user-mode + ========= =============================================================== + +Required machine state on exit to fsyscall handler +-------------------------------------------------- + + ========= =========================================================== + r11 saved ar.pfs (as passed into the fsyscall handler) + r15 system call number (as passed into the fsyscall handler) + r32-r39 system call arguments (as passed into the fsyscall handler) + b6 return address (as passed into the fsyscall handler) + ar.pfs previous frame-state (as passed into the fsyscall handler) + ========= =========================================================== + +Fsyscall handlers can execute with very little overhead, but with that +speed comes a set of restrictions: + + * Fsyscall-handlers MUST check for any pending work in the flags + member of the thread-info structure and if any of the + TIF_ALLWORK_MASK flags are set, the handler needs to fall back on + doing a full system call (by calling fsys_fallback_syscall). + + * Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11, + r15, b6, and ar.pfs) because they will be needed in case of a + system call restart. Of course, all "preserved" registers also + must be preserved, in accordance to the normal calling conventions. + + * Fsyscall-handlers MUST check argument registers for containing a + NaT value before using them in any way that could trigger a + NaT-consumption fault. If a system call argument is found to + contain a NaT value, an fsyscall-handler may return immediately + with r8=EINVAL, r10=-1. + + * Fsyscall-handlers MUST NOT use the "alloc" instruction or perform + any other operation that would trigger mandatory RSE + (register-stack engine) traffic. + + * Fsyscall-handlers MUST NOT write to any stacked registers because + it is not safe to assume that user-level called a handler with the + proper number of arguments. + + * Fsyscall-handlers need to be careful when accessing per-CPU variables: + unless proper safe-guards are taken (e.g., interruptions are avoided), + execution may be pre-empted and resumed on another CPU at any given + time. + + * Fsyscall-handlers must be careful not to leak sensitive kernel' + information back to user-level. In particular, before returning to + user-level, care needs to be taken to clear any scratch registers + that could contain sensitive information (note that the current + task pointer is not considered sensitive: it's already exposed + through ar.k6). + + * Fsyscall-handlers MUST NOT access user-memory without first + validating access-permission (this can be done typically via + probe.r.fault and/or probe.w.fault) and without guarding against + memory access exceptions (this can be done with the EX() macros + defined by asmmacro.h). + +The above restrictions may seem draconian, but remember that it's +possible to trade off some of the restrictions by paying a slightly +higher overhead. For example, if an fsyscall-handler could benefit +from the shadow register bank, it could temporarily disable PSR.i and +PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as +needed. In other words, following the above rules yields extremely +fast system call execution (while fully preserving system call +semantics), but there is also a lot of flexibility in handling more +complicated cases. + +Signal handling +=============== + +The delivery of (asynchronous) signals must be delayed until fsys-mode +is exited. This is accomplished with the help of the lower-privilege +transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user() +checks whether the interrupted task was in fsys-mode and, if so, sets +PSR.lp and returns immediately. When fsys-mode is exited via the +"br.ret" instruction that lowers the privilege level, a trap will +occur. The trap handler clears PSR.lp again and returns immediately. +The kernel exit path then checks for and delivers any pending signals. + +PSR Handling +============ + +The "epc" instruction doesn't change the contents of PSR at all. This +is in contrast to a regular interruption, which clears almost all +bits. Because of that, some care needs to be taken to ensure things +work as expected. The following discussion describes how each PSR bit +is handled. + +======= ======================================================================= +PSR.be Cleared when entering fsys-mode. A srlz.d instruction is used + to ensure the CPU is in little-endian mode before the first + load/store instruction is executed. PSR.be is normally NOT + restored upon return from an fsys-mode handler. In other + words, user-level code must not rely on PSR.be being preserved + across a system call. +PSR.up Unchanged. +PSR.ac Unchanged. +PSR.mfl Unchanged. Note: fsys-mode handlers must not write-registers! +PSR.mfh Unchanged. Note: fsys-mode handlers must not write-registers! +PSR.ic Unchanged. Note: fsys-mode handlers can clear the bit, if needed. +PSR.i Unchanged. Note: fsys-mode handlers can clear the bit, if needed. +PSR.pk Unchanged. +PSR.dt Unchanged. +PSR.dfl Unchanged. Note: fsys-mode handlers must not write-registers! +PSR.dfh Unchanged. Note: fsys-mode handlers must not write-registers! +PSR.sp Unchanged. +PSR.pp Unchanged. +PSR.di Unchanged. +PSR.si Unchanged. +PSR.db Unchanged. The kernel prevents user-level from setting a hardware + breakpoint that triggers at any privilege level other than + 3 (user-mode). +PSR.lp Unchanged. +PSR.tb Lazy redirect. If a taken-branch trap occurs while in + fsys-mode, the trap-handler modifies the saved machine state + such that execution resumes in the gate page at + syscall_via_break(), with privilege level 3. Note: the + taken branch would occur on the branch invoking the + fsyscall-handler, at which point, by definition, a syscall + restart is still safe. If the system call number is invalid, + the fsys-mode handler will return directly to user-level. This + return will trigger a taken-branch trap, but since the trap is + taken _after_ restoring the privilege level, the CPU has already + left fsys-mode, so no special treatment is needed. +PSR.rt Unchanged. +PSR.cpl Cleared to 0. +PSR.is Unchanged (guaranteed to be 0 on entry to the gate page). +PSR.mc Unchanged. +PSR.it Unchanged (guaranteed to be 1). +PSR.id Unchanged. Note: the ia64 linux kernel never sets this bit. +PSR.da Unchanged. Note: the ia64 linux kernel never sets this bit. +PSR.dd Unchanged. Note: the ia64 linux kernel never sets this bit. +PSR.ss Lazy redirect. If set, "epc" will cause a Single Step Trap to + be taken. The trap handler then modifies the saved machine + state such that execution resumes in the gate page at + syscall_via_break(), with privilege level 3. +PSR.ri Unchanged. +PSR.ed Unchanged. Note: This bit could only have an effect if an fsys-mode + handler performed a speculative load that gets NaTted. If so, this + would be the normal & expected behavior, so no special treatment is + needed. +PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed. + Doing so requires clearing PSR.i and PSR.ic as well. +PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit. +======= ======================================================================= + +Using fast system calls +======================= + +To use fast system calls, userspace applications need simply call +__kernel_syscall_via_epc(). For example + +-- example fgettimeofday() call -- + +-- fgettimeofday.S -- + +:: + + #include + + GLOBAL_ENTRY(fgettimeofday) + .prologue + .save ar.pfs, r11 + mov r11 = ar.pfs + .body + + mov r2 = 0xa000000000020660;; // gate address + // found by inspection of System.map for the + // __kernel_syscall_via_epc() function. See + // below for how to do this for real. + + mov b7 = r2 + mov r15 = 1087 // gettimeofday syscall + ;; + br.call.sptk.many b6 = b7 + ;; + + .restore sp + + mov ar.pfs = r11 + br.ret.sptk.many rp;; // return to caller + END(fgettimeofday) + +-- end fgettimeofday.S -- + +In reality, getting the gate address is accomplished by two extra +values passed via the ELF auxiliary vector (include/asm-ia64/elf.h) + + * AT_SYSINFO : is the address of __kernel_syscall_via_epc() + * AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO + +The ELF DSO is a pre-linked library that is mapped in by the kernel at +the gate page. It is a proper ELF shared object so, with a dynamic +loader that recognises the library, you should be able to make calls to +the exported functions within it as with any other shared library. +AT_SYSINFO points into the kernel DSO at the +__kernel_syscall_via_epc() function for historical reasons (it was +used before the kernel DSO) and as a convenience. diff --git a/Documentation/ia64/fsys.txt b/Documentation/ia64/fsys.txt deleted file mode 100644 index 59dd689d9b86..000000000000 --- a/Documentation/ia64/fsys.txt +++ /dev/null @@ -1,286 +0,0 @@ --*-Mode: outline-*- - - Light-weight System Calls for IA-64 - ----------------------------------- - - Started: 13-Jan-2003 - Last update: 27-Sep-2003 - - David Mosberger-Tang - - -Using the "epc" instruction effectively introduces a new mode of -execution to the ia64 linux kernel. We call this mode the -"fsys-mode". To recap, the normal states of execution are: - - - kernel mode: - Both the register stack and the memory stack have been - switched over to kernel memory. The user-level state is saved - in a pt-regs structure at the top of the kernel memory stack. - - - user mode: - Both the register stack and the kernel stack are in - user memory. The user-level state is contained in the - CPU registers. - - - bank 0 interruption-handling mode: - This is the non-interruptible state which all - interruption-handlers start execution in. The user-level - state remains in the CPU registers and some kernel state may - be stored in bank 0 of registers r16-r31. - -In contrast, fsys-mode has the following special properties: - - - execution is at privilege level 0 (most-privileged) - - - CPU registers may contain a mixture of user-level and kernel-level - state (it is the responsibility of the kernel to ensure that no - security-sensitive kernel-level state is leaked back to - user-level) - - - execution is interruptible and preemptible (an fsys-mode handler - can disable interrupts and avoid all other interruption-sources - to avoid preemption) - - - neither the memory-stack nor the register-stack can be trusted while - in fsys-mode (they point to the user-level stacks, which may - be invalid, or completely bogus addresses) - -In summary, fsys-mode is much more similar to running in user-mode -than it is to running in kernel-mode. Of course, given that the -privilege level is at level 0, this means that fsys-mode requires some -care (see below). - - -* How to tell fsys-mode - -Linux operates in fsys-mode when (a) the privilege level is 0 (most -privileged) and (b) the stacks have NOT been switched to kernel memory -yet. For convenience, the header file provides -three macros: - - user_mode(regs) - user_stack(task,regs) - fsys_mode(task,regs) - -The "regs" argument is a pointer to a pt_regs structure. The "task" -argument is a pointer to the task structure to which the "regs" -pointer belongs to. user_mode() returns TRUE if the CPU state pointed -to by "regs" was executing in user mode (privilege level 3). -user_stack() returns TRUE if the state pointed to by "regs" was -executing on the user-level stack(s). Finally, fsys_mode() returns -TRUE if the CPU state pointed to by "regs" was executing in fsys-mode. -The fsys_mode() macro is equivalent to the expression: - - !user_mode(regs) && user_stack(task,regs) - -* How to write an fsyscall handler - -The file arch/ia64/kernel/fsys.S contains a table of fsyscall-handlers -(fsyscall_table). This table contains one entry for each system call. -By default, a system call is handled by fsys_fallback_syscall(). This -routine takes care of entering (full) kernel mode and calling the -normal Linux system call handler. For performance-critical system -calls, it is possible to write a hand-tuned fsyscall_handler. For -example, fsys.S contains fsys_getpid(), which is a hand-tuned version -of the getpid() system call. - -The entry and exit-state of an fsyscall handler is as follows: - -** Machine state on entry to fsyscall handler: - - - r10 = 0 - - r11 = saved ar.pfs (a user-level value) - - r15 = system call number - - r16 = "current" task pointer (in normal kernel-mode, this is in r13) - - r32-r39 = system call arguments - - b6 = return address (a user-level value) - - ar.pfs = previous frame-state (a user-level value) - - PSR.be = cleared to zero (i.e., little-endian byte order is in effect) - - all other registers may contain values passed in from user-mode - -** Required machine state on exit to fsyscall handler: - - - r11 = saved ar.pfs (as passed into the fsyscall handler) - - r15 = system call number (as passed into the fsyscall handler) - - r32-r39 = system call arguments (as passed into the fsyscall handler) - - b6 = return address (as passed into the fsyscall handler) - - ar.pfs = previous frame-state (as passed into the fsyscall handler) - -Fsyscall handlers can execute with very little overhead, but with that -speed comes a set of restrictions: - - o Fsyscall-handlers MUST check for any pending work in the flags - member of the thread-info structure and if any of the - TIF_ALLWORK_MASK flags are set, the handler needs to fall back on - doing a full system call (by calling fsys_fallback_syscall). - - o Fsyscall-handlers MUST preserve incoming arguments (r32-r39, r11, - r15, b6, and ar.pfs) because they will be needed in case of a - system call restart. Of course, all "preserved" registers also - must be preserved, in accordance to the normal calling conventions. - - o Fsyscall-handlers MUST check argument registers for containing a - NaT value before using them in any way that could trigger a - NaT-consumption fault. If a system call argument is found to - contain a NaT value, an fsyscall-handler may return immediately - with r8=EINVAL, r10=-1. - - o Fsyscall-handlers MUST NOT use the "alloc" instruction or perform - any other operation that would trigger mandatory RSE - (register-stack engine) traffic. - - o Fsyscall-handlers MUST NOT write to any stacked registers because - it is not safe to assume that user-level called a handler with the - proper number of arguments. - - o Fsyscall-handlers need to be careful when accessing per-CPU variables: - unless proper safe-guards are taken (e.g., interruptions are avoided), - execution may be pre-empted and resumed on another CPU at any given - time. - - o Fsyscall-handlers must be careful not to leak sensitive kernel' - information back to user-level. In particular, before returning to - user-level, care needs to be taken to clear any scratch registers - that could contain sensitive information (note that the current - task pointer is not considered sensitive: it's already exposed - through ar.k6). - - o Fsyscall-handlers MUST NOT access user-memory without first - validating access-permission (this can be done typically via - probe.r.fault and/or probe.w.fault) and without guarding against - memory access exceptions (this can be done with the EX() macros - defined by asmmacro.h). - -The above restrictions may seem draconian, but remember that it's -possible to trade off some of the restrictions by paying a slightly -higher overhead. For example, if an fsyscall-handler could benefit -from the shadow register bank, it could temporarily disable PSR.i and -PSR.ic, switch to bank 0 (bsw.0) and then use the shadow registers as -needed. In other words, following the above rules yields extremely -fast system call execution (while fully preserving system call -semantics), but there is also a lot of flexibility in handling more -complicated cases. - -* Signal handling - -The delivery of (asynchronous) signals must be delayed until fsys-mode -is exited. This is accomplished with the help of the lower-privilege -transfer trap: arch/ia64/kernel/process.c:do_notify_resume_user() -checks whether the interrupted task was in fsys-mode and, if so, sets -PSR.lp and returns immediately. When fsys-mode is exited via the -"br.ret" instruction that lowers the privilege level, a trap will -occur. The trap handler clears PSR.lp again and returns immediately. -The kernel exit path then checks for and delivers any pending signals. - -* PSR Handling - -The "epc" instruction doesn't change the contents of PSR at all. This -is in contrast to a regular interruption, which clears almost all -bits. Because of that, some care needs to be taken to ensure things -work as expected. The following discussion describes how each PSR bit -is handled. - -PSR.be Cleared when entering fsys-mode. A srlz.d instruction is used - to ensure the CPU is in little-endian mode before the first - load/store instruction is executed. PSR.be is normally NOT - restored upon return from an fsys-mode handler. In other - words, user-level code must not rely on PSR.be being preserved - across a system call. -PSR.up Unchanged. -PSR.ac Unchanged. -PSR.mfl Unchanged. Note: fsys-mode handlers must not write-registers! -PSR.mfh Unchanged. Note: fsys-mode handlers must not write-registers! -PSR.ic Unchanged. Note: fsys-mode handlers can clear the bit, if needed. -PSR.i Unchanged. Note: fsys-mode handlers can clear the bit, if needed. -PSR.pk Unchanged. -PSR.dt Unchanged. -PSR.dfl Unchanged. Note: fsys-mode handlers must not write-registers! -PSR.dfh Unchanged. Note: fsys-mode handlers must not write-registers! -PSR.sp Unchanged. -PSR.pp Unchanged. -PSR.di Unchanged. -PSR.si Unchanged. -PSR.db Unchanged. The kernel prevents user-level from setting a hardware - breakpoint that triggers at any privilege level other than 3 (user-mode). -PSR.lp Unchanged. -PSR.tb Lazy redirect. If a taken-branch trap occurs while in - fsys-mode, the trap-handler modifies the saved machine state - such that execution resumes in the gate page at - syscall_via_break(), with privilege level 3. Note: the - taken branch would occur on the branch invoking the - fsyscall-handler, at which point, by definition, a syscall - restart is still safe. If the system call number is invalid, - the fsys-mode handler will return directly to user-level. This - return will trigger a taken-branch trap, but since the trap is - taken _after_ restoring the privilege level, the CPU has already - left fsys-mode, so no special treatment is needed. -PSR.rt Unchanged. -PSR.cpl Cleared to 0. -PSR.is Unchanged (guaranteed to be 0 on entry to the gate page). -PSR.mc Unchanged. -PSR.it Unchanged (guaranteed to be 1). -PSR.id Unchanged. Note: the ia64 linux kernel never sets this bit. -PSR.da Unchanged. Note: the ia64 linux kernel never sets this bit. -PSR.dd Unchanged. Note: the ia64 linux kernel never sets this bit. -PSR.ss Lazy redirect. If set, "epc" will cause a Single Step Trap to - be taken. The trap handler then modifies the saved machine - state such that execution resumes in the gate page at - syscall_via_break(), with privilege level 3. -PSR.ri Unchanged. -PSR.ed Unchanged. Note: This bit could only have an effect if an fsys-mode - handler performed a speculative load that gets NaTted. If so, this - would be the normal & expected behavior, so no special treatment is - needed. -PSR.bn Unchanged. Note: fsys-mode handlers may clear the bit, if needed. - Doing so requires clearing PSR.i and PSR.ic as well. -PSR.ia Unchanged. Note: the ia64 linux kernel never sets this bit. - -* Using fast system calls - -To use fast system calls, userspace applications need simply call -__kernel_syscall_via_epc(). For example - --- example fgettimeofday() call -- --- fgettimeofday.S -- - -#include - -GLOBAL_ENTRY(fgettimeofday) -.prologue -.save ar.pfs, r11 -mov r11 = ar.pfs -.body - -mov r2 = 0xa000000000020660;; // gate address - // found by inspection of System.map for the - // __kernel_syscall_via_epc() function. See - // below for how to do this for real. - -mov b7 = r2 -mov r15 = 1087 // gettimeofday syscall -;; -br.call.sptk.many b6 = b7 -;; - -.restore sp - -mov ar.pfs = r11 -br.ret.sptk.many rp;; // return to caller -END(fgettimeofday) - --- end fgettimeofday.S -- - -In reality, getting the gate address is accomplished by two extra -values passed via the ELF auxiliary vector (include/asm-ia64/elf.h) - - o AT_SYSINFO : is the address of __kernel_syscall_via_epc() - o AT_SYSINFO_EHDR : is the address of the kernel gate ELF DSO - -The ELF DSO is a pre-linked library that is mapped in by the kernel at -the gate page. It is a proper ELF shared object so, with a dynamic -loader that recognises the library, you should be able to make calls to -the exported functions within it as with any other shared library. -AT_SYSINFO points into the kernel DSO at the -__kernel_syscall_via_epc() function for historical reasons (it was -used before the kernel DSO) and as a convenience. diff --git a/Documentation/ia64/ia64.rst b/Documentation/ia64/ia64.rst new file mode 100644 index 000000000000..b725019a9492 --- /dev/null +++ b/Documentation/ia64/ia64.rst @@ -0,0 +1,49 @@ +=========================================== +Linux kernel release for the IA-64 Platform +=========================================== + + These are the release notes for Linux since version 2.4 for IA-64 + platform. This document provides information specific to IA-64 + ONLY, to get additional information about the Linux kernel also + read the original Linux README provided with the kernel. + +Installing the Kernel +===================== + + - IA-64 kernel installation is the same as the other platforms, see + original README for details. + + +Software Requirements +===================== + + Compiling and running this kernel requires an IA-64 compliant GCC + compiler. And various software packages also compiled with an + IA-64 compliant GCC compiler. + + +Configuring the Kernel +====================== + + Configuration is the same, see original README for details. + + +Compiling the Kernel: + + - Compiling this kernel doesn't differ from other platform so read + the original README for details BUT make sure you have an IA-64 + compliant GCC compiler. + +IA-64 Specifics +=============== + + - General issues: + + * Hardly any performance tuning has been done. Obvious targets + include the library routines (IP checksum, etc.). Less + obvious targets include making sure we don't flush the TLB + needlessly, etc. + + * SMP locks cleanup/optimization + + * IA32 support. Currently experimental. It mostly works. diff --git a/Documentation/ia64/index.rst b/Documentation/ia64/index.rst new file mode 100644 index 000000000000..a3e3052ad6e2 --- /dev/null +++ b/Documentation/ia64/index.rst @@ -0,0 +1,18 @@ +:orphan: + +================== +IA-64 Architecture +================== + +.. toctree:: + :maxdepth: 1 + + ia64 + aliasing + efirtc + err_inject + fsys + irq-redir + mca + serial + xen diff --git a/Documentation/ia64/irq-redir.rst b/Documentation/ia64/irq-redir.rst new file mode 100644 index 000000000000..39bf94484a15 --- /dev/null +++ b/Documentation/ia64/irq-redir.rst @@ -0,0 +1,80 @@ +============================== +IRQ affinity on IA64 platforms +============================== + +07.01.2002, Erich Focht + + +By writing to /proc/irq/IRQ#/smp_affinity the interrupt routing can be +controlled. The behavior on IA64 platforms is slightly different from +that described in Documentation/IRQ-affinity.txt for i386 systems. + +Because of the usage of SAPIC mode and physical destination mode the +IRQ target is one particular CPU and cannot be a mask of several +CPUs. Only the first non-zero bit is taken into account. + + +Usage examples +============== + +The target CPU has to be specified as a hexadecimal CPU mask. The +first non-zero bit is the selected CPU. This format has been kept for +compatibility reasons with i386. + +Set the delivery mode of interrupt 41 to fixed and route the +interrupts to CPU #3 (logical CPU number) (2^3=0x08):: + + echo "8" >/proc/irq/41/smp_affinity + +Set the default route for IRQ number 41 to CPU 6 in lowest priority +delivery mode (redirectable):: + + echo "r 40" >/proc/irq/41/smp_affinity + +The output of the command:: + + cat /proc/irq/IRQ#/smp_affinity + +gives the target CPU mask for the specified interrupt vector. If the CPU +mask is preceded by the character "r", the interrupt is redirectable +(i.e. lowest priority mode routing is used), otherwise its route is +fixed. + + + +Initialization and default behavior +=================================== + +If the platform features IRQ redirection (info provided by SAL) all +IO-SAPIC interrupts are initialized with CPU#0 as their default target +and the routing is the so called "lowest priority mode" (actually +fixed SAPIC mode with hint). The XTP chipset registers are used as hints +for the IRQ routing. Currently in Linux XTP registers can have three +values: + + - minimal for an idle task, + - normal if any other task runs, + - maximal if the CPU is going to be switched off. + +The IRQ is routed to the CPU with lowest XTP register value, the +search begins at the default CPU. Therefore most of the interrupts +will be handled by CPU #0. + +If the platform doesn't feature interrupt redirection IOSAPIC fixed +routing is used. The target CPUs are distributed in a round robin +manner. IRQs will be routed only to the selected target CPUs. Check +with:: + + cat /proc/interrupts + + + +Comments +======== + +On large (multi-node) systems it is recommended to route the IRQs to +the node to which the corresponding device is connected. +For systems like the NEC AzusA we get IRQ node-affinity for free. This +is because usually the chipsets on each node redirect the interrupts +only to their own CPUs (as they cannot see the XTP registers on the +other nodes). diff --git a/Documentation/ia64/mca.rst b/Documentation/ia64/mca.rst new file mode 100644 index 000000000000..08270bba44a4 --- /dev/null +++ b/Documentation/ia64/mca.rst @@ -0,0 +1,198 @@ +============================================================= +An ad-hoc collection of notes on IA64 MCA and INIT processing +============================================================= + +Feel free to update it with notes about any area that is not clear. + +--- + +MCA/INIT are completely asynchronous. They can occur at any time, when +the OS is in any state. Including when one of the cpus is already +holding a spinlock. Trying to get any lock from MCA/INIT state is +asking for deadlock. Also the state of structures that are protected +by locks is indeterminate, including linked lists. + +--- + +The complicated ia64 MCA process. All of this is mandated by Intel's +specification for ia64 SAL, error recovery and unwind, it is not as +if we have a choice here. + +* MCA occurs on one cpu, usually due to a double bit memory error. + This is the monarch cpu. + +* SAL sends an MCA rendezvous interrupt (which is a normal interrupt) + to all the other cpus, the slaves. + +* Slave cpus that receive the MCA interrupt call down into SAL, they + end up spinning disabled while the MCA is being serviced. + +* If any slave cpu was already spinning disabled when the MCA occurred + then it cannot service the MCA interrupt. SAL waits ~20 seconds then + sends an unmaskable INIT event to the slave cpus that have not + already rendezvoused. + +* Because MCA/INIT can be delivered at any time, including when the cpu + is down in PAL in physical mode, the registers at the time of the + event are _completely_ undefined. In particular the MCA/INIT + handlers cannot rely on the thread pointer, PAL physical mode can + (and does) modify TP. It is allowed to do that as long as it resets + TP on return. However MCA/INIT events expose us to these PAL + internal TP changes. Hence curr_task(). + +* If an MCA/INIT event occurs while the kernel was running (not user + space) and the kernel has called PAL then the MCA/INIT handler cannot + assume that the kernel stack is in a fit state to be used. Mainly + because PAL may or may not maintain the stack pointer internally. + Because the MCA/INIT handlers cannot trust the kernel stack, they + have to use their own, per-cpu stacks. The MCA/INIT stacks are + preformatted with just enough task state to let the relevant handlers + do their job. + +* Unlike most other architectures, the ia64 struct task is embedded in + the kernel stack[1]. So switching to a new kernel stack means that + we switch to a new task as well. Because various bits of the kernel + assume that current points into the struct task, switching to a new + stack also means a new value for current. + +* Once all slaves have rendezvoused and are spinning disabled, the + monarch is entered. The monarch now tries to diagnose the problem + and decide if it can recover or not. + +* Part of the monarch's job is to look at the state of all the other + tasks. The only way to do that on ia64 is to call the unwinder, + as mandated by Intel. + +* The starting point for the unwind depends on whether a task is + running or not. That is, whether it is on a cpu or is blocked. The + monarch has to determine whether or not a task is on a cpu before it + knows how to start unwinding it. The tasks that received an MCA or + INIT event are no longer running, they have been converted to blocked + tasks. But (and its a big but), the cpus that received the MCA + rendezvous interrupt are still running on their normal kernel stacks! + +* To distinguish between these two cases, the monarch must know which + tasks are on a cpu and which are not. Hence each slave cpu that + switches to an MCA/INIT stack, registers its new stack using + set_curr_task(), so the monarch can tell that the _original_ task is + no longer running on that cpu. That gives us a decent chance of + getting a valid backtrace of the _original_ task. + +* MCA/INIT can be nested, to a depth of 2 on any cpu. In the case of a + nested error, we want diagnostics on the MCA/INIT handler that + failed, not on the task that was originally running. Again this + requires set_curr_task() so the MCA/INIT handlers can register their + own stack as running on that cpu. Then a recursive error gets a + trace of the failing handler's "task". + +[1] + My (Keith Owens) original design called for ia64 to separate its + struct task and the kernel stacks. Then the MCA/INIT data would be + chained stacks like i386 interrupt stacks. But that required + radical surgery on the rest of ia64, plus extra hard wired TLB + entries with its associated performance degradation. David + Mosberger vetoed that approach. Which meant that separate kernel + stacks meant separate "tasks" for the MCA/INIT handlers. + +--- + +INIT is less complicated than MCA. Pressing the nmi button or using +the equivalent command on the management console sends INIT to all +cpus. SAL picks one of the cpus as the monarch and the rest are +slaves. All the OS INIT handlers are entered at approximately the same +time. The OS monarch prints the state of all tasks and returns, after +which the slaves return and the system resumes. + +At least that is what is supposed to happen. Alas there are broken +versions of SAL out there. Some drive all the cpus as monarchs. Some +drive them all as slaves. Some drive one cpu as monarch, wait for that +cpu to return from the OS then drive the rest as slaves. Some versions +of SAL cannot even cope with returning from the OS, they spin inside +SAL on resume. The OS INIT code has workarounds for some of these +broken SAL symptoms, but some simply cannot be fixed from the OS side. + +--- + +The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer +violations. Unfortunately MCA/INIT start off as massive layer +violations (can occur at _any_ time) and they build from there. + +At least ia64 makes an attempt at recovering from hardware errors, but +it is a difficult problem because of the asynchronous nature of these +errors. When processing an unmaskable interrupt we sometimes need +special code to cope with our inability to take any locks. + +--- + +How is ia64 MCA/INIT different from x86 NMI? + +* x86 NMI typically gets delivered to one cpu. MCA/INIT gets sent to + all cpus. + +* x86 NMI cannot be nested. MCA/INIT can be nested, to a depth of 2 + per cpu. + +* x86 has a separate struct task which points to one of multiple kernel + stacks. ia64 has the struct task embedded in the single kernel + stack, so switching stack means switching task. + +* x86 does not call the BIOS so the NMI handler does not have to worry + about any registers having changed. MCA/INIT can occur while the cpu + is in PAL in physical mode, with undefined registers and an undefined + kernel stack. + +* i386 backtrace is not very sensitive to whether a process is running + or not. ia64 unwind is very, very sensitive to whether a process is + running or not. + +--- + +What happens when MCA/INIT is delivered what a cpu is running user +space code? + +The user mode registers are stored in the RSE area of the MCA/INIT on +entry to the OS and are restored from there on return to SAL, so user +mode registers are preserved across a recoverable MCA/INIT. Since the +OS has no idea what unwind data is available for the user space stack, +MCA/INIT never tries to backtrace user space. Which means that the OS +does not bother making the user space process look like a blocked task, +i.e. the OS does not copy pt_regs and switch_stack to the user space +stack. Also the OS has no idea how big the user space RSE and memory +stacks are, which makes it too risky to copy the saved state to a user +mode stack. + +--- + +How do we get a backtrace on the tasks that were running when MCA/INIT +was delivered? + +mca.c:::ia64_mca_modify_original_stack(). That identifies and +verifies the original kernel stack, copies the dirty registers from +the MCA/INIT stack's RSE to the original stack's RSE, copies the +skeleton struct pt_regs and switch_stack to the original stack, fills +in the skeleton structures from the PAL minstate area and updates the +original stack's thread.ksp. That makes the original stack look +exactly like any other blocked task, i.e. it now appears to be +sleeping. To get a backtrace, just start with thread.ksp for the +original task and unwind like any other sleeping task. + +--- + +How do we identify the tasks that were running when MCA/INIT was +delivered? + +If the previous task has been verified and converted to a blocked +state, then sos->prev_task on the MCA/INIT stack is updated to point to +the previous task. You can look at that field in dumps or debuggers. +To help distinguish between the handler and the original tasks, +handlers have _TIF_MCA_INIT set in thread_info.flags. + +The sos data is always in the MCA/INIT handler stack, at offset +MCA_SOS_OFFSET. You can get that value from mca_asm.h or calculate it +as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct +ia64_sal_os_state), with 16 byte alignment for all structures. + +Also the comm field of the MCA/INIT task is modified to include the pid +of the original task, for humans to use. For example, a comm field of +'MCA 12159' means that pid 12159 was running when the MCA was +delivered. diff --git a/Documentation/ia64/mca.txt b/Documentation/ia64/mca.txt deleted file mode 100644 index f097c60cba1b..000000000000 --- a/Documentation/ia64/mca.txt +++ /dev/null @@ -1,194 +0,0 @@ -An ad-hoc collection of notes on IA64 MCA and INIT processing. Feel -free to update it with notes about any area that is not clear. - ---- - -MCA/INIT are completely asynchronous. They can occur at any time, when -the OS is in any state. Including when one of the cpus is already -holding a spinlock. Trying to get any lock from MCA/INIT state is -asking for deadlock. Also the state of structures that are protected -by locks is indeterminate, including linked lists. - ---- - -The complicated ia64 MCA process. All of this is mandated by Intel's -specification for ia64 SAL, error recovery and unwind, it is not as -if we have a choice here. - -* MCA occurs on one cpu, usually due to a double bit memory error. - This is the monarch cpu. - -* SAL sends an MCA rendezvous interrupt (which is a normal interrupt) - to all the other cpus, the slaves. - -* Slave cpus that receive the MCA interrupt call down into SAL, they - end up spinning disabled while the MCA is being serviced. - -* If any slave cpu was already spinning disabled when the MCA occurred - then it cannot service the MCA interrupt. SAL waits ~20 seconds then - sends an unmaskable INIT event to the slave cpus that have not - already rendezvoused. - -* Because MCA/INIT can be delivered at any time, including when the cpu - is down in PAL in physical mode, the registers at the time of the - event are _completely_ undefined. In particular the MCA/INIT - handlers cannot rely on the thread pointer, PAL physical mode can - (and does) modify TP. It is allowed to do that as long as it resets - TP on return. However MCA/INIT events expose us to these PAL - internal TP changes. Hence curr_task(). - -* If an MCA/INIT event occurs while the kernel was running (not user - space) and the kernel has called PAL then the MCA/INIT handler cannot - assume that the kernel stack is in a fit state to be used. Mainly - because PAL may or may not maintain the stack pointer internally. - Because the MCA/INIT handlers cannot trust the kernel stack, they - have to use their own, per-cpu stacks. The MCA/INIT stacks are - preformatted with just enough task state to let the relevant handlers - do their job. - -* Unlike most other architectures, the ia64 struct task is embedded in - the kernel stack[1]. So switching to a new kernel stack means that - we switch to a new task as well. Because various bits of the kernel - assume that current points into the struct task, switching to a new - stack also means a new value for current. - -* Once all slaves have rendezvoused and are spinning disabled, the - monarch is entered. The monarch now tries to diagnose the problem - and decide if it can recover or not. - -* Part of the monarch's job is to look at the state of all the other - tasks. The only way to do that on ia64 is to call the unwinder, - as mandated by Intel. - -* The starting point for the unwind depends on whether a task is - running or not. That is, whether it is on a cpu or is blocked. The - monarch has to determine whether or not a task is on a cpu before it - knows how to start unwinding it. The tasks that received an MCA or - INIT event are no longer running, they have been converted to blocked - tasks. But (and its a big but), the cpus that received the MCA - rendezvous interrupt are still running on their normal kernel stacks! - -* To distinguish between these two cases, the monarch must know which - tasks are on a cpu and which are not. Hence each slave cpu that - switches to an MCA/INIT stack, registers its new stack using - set_curr_task(), so the monarch can tell that the _original_ task is - no longer running on that cpu. That gives us a decent chance of - getting a valid backtrace of the _original_ task. - -* MCA/INIT can be nested, to a depth of 2 on any cpu. In the case of a - nested error, we want diagnostics on the MCA/INIT handler that - failed, not on the task that was originally running. Again this - requires set_curr_task() so the MCA/INIT handlers can register their - own stack as running on that cpu. Then a recursive error gets a - trace of the failing handler's "task". - -[1] My (Keith Owens) original design called for ia64 to separate its - struct task and the kernel stacks. Then the MCA/INIT data would be - chained stacks like i386 interrupt stacks. But that required - radical surgery on the rest of ia64, plus extra hard wired TLB - entries with its associated performance degradation. David - Mosberger vetoed that approach. Which meant that separate kernel - stacks meant separate "tasks" for the MCA/INIT handlers. - ---- - -INIT is less complicated than MCA. Pressing the nmi button or using -the equivalent command on the management console sends INIT to all -cpus. SAL picks one of the cpus as the monarch and the rest are -slaves. All the OS INIT handlers are entered at approximately the same -time. The OS monarch prints the state of all tasks and returns, after -which the slaves return and the system resumes. - -At least that is what is supposed to happen. Alas there are broken -versions of SAL out there. Some drive all the cpus as monarchs. Some -drive them all as slaves. Some drive one cpu as monarch, wait for that -cpu to return from the OS then drive the rest as slaves. Some versions -of SAL cannot even cope with returning from the OS, they spin inside -SAL on resume. The OS INIT code has workarounds for some of these -broken SAL symptoms, but some simply cannot be fixed from the OS side. - ---- - -The scheduler hooks used by ia64 (curr_task, set_curr_task) are layer -violations. Unfortunately MCA/INIT start off as massive layer -violations (can occur at _any_ time) and they build from there. - -At least ia64 makes an attempt at recovering from hardware errors, but -it is a difficult problem because of the asynchronous nature of these -errors. When processing an unmaskable interrupt we sometimes need -special code to cope with our inability to take any locks. - ---- - -How is ia64 MCA/INIT different from x86 NMI? - -* x86 NMI typically gets delivered to one cpu. MCA/INIT gets sent to - all cpus. - -* x86 NMI cannot be nested. MCA/INIT can be nested, to a depth of 2 - per cpu. - -* x86 has a separate struct task which points to one of multiple kernel - stacks. ia64 has the struct task embedded in the single kernel - stack, so switching stack means switching task. - -* x86 does not call the BIOS so the NMI handler does not have to worry - about any registers having changed. MCA/INIT can occur while the cpu - is in PAL in physical mode, with undefined registers and an undefined - kernel stack. - -* i386 backtrace is not very sensitive to whether a process is running - or not. ia64 unwind is very, very sensitive to whether a process is - running or not. - ---- - -What happens when MCA/INIT is delivered what a cpu is running user -space code? - -The user mode registers are stored in the RSE area of the MCA/INIT on -entry to the OS and are restored from there on return to SAL, so user -mode registers are preserved across a recoverable MCA/INIT. Since the -OS has no idea what unwind data is available for the user space stack, -MCA/INIT never tries to backtrace user space. Which means that the OS -does not bother making the user space process look like a blocked task, -i.e. the OS does not copy pt_regs and switch_stack to the user space -stack. Also the OS has no idea how big the user space RSE and memory -stacks are, which makes it too risky to copy the saved state to a user -mode stack. - ---- - -How do we get a backtrace on the tasks that were running when MCA/INIT -was delivered? - -mca.c:::ia64_mca_modify_original_stack(). That identifies and -verifies the original kernel stack, copies the dirty registers from -the MCA/INIT stack's RSE to the original stack's RSE, copies the -skeleton struct pt_regs and switch_stack to the original stack, fills -in the skeleton structures from the PAL minstate area and updates the -original stack's thread.ksp. That makes the original stack look -exactly like any other blocked task, i.e. it now appears to be -sleeping. To get a backtrace, just start with thread.ksp for the -original task and unwind like any other sleeping task. - ---- - -How do we identify the tasks that were running when MCA/INIT was -delivered? - -If the previous task has been verified and converted to a blocked -state, then sos->prev_task on the MCA/INIT stack is updated to point to -the previous task. You can look at that field in dumps or debuggers. -To help distinguish between the handler and the original tasks, -handlers have _TIF_MCA_INIT set in thread_info.flags. - -The sos data is always in the MCA/INIT handler stack, at offset -MCA_SOS_OFFSET. You can get that value from mca_asm.h or calculate it -as KERNEL_STACK_SIZE - sizeof(struct pt_regs) - sizeof(struct -ia64_sal_os_state), with 16 byte alignment for all structures. - -Also the comm field of the MCA/INIT task is modified to include the pid -of the original task, for humans to use. For example, a comm field of -'MCA 12159' means that pid 12159 was running when the MCA was -delivered. diff --git a/Documentation/ia64/serial.rst b/Documentation/ia64/serial.rst new file mode 100644 index 000000000000..1de70c305a79 --- /dev/null +++ b/Documentation/ia64/serial.rst @@ -0,0 +1,165 @@ +============== +Serial Devices +============== + +Serial Device Naming +==================== + + As of 2.6.10, serial devices on ia64 are named based on the + order of ACPI and PCI enumeration. The first device in the + ACPI namespace (if any) becomes /dev/ttyS0, the second becomes + /dev/ttyS1, etc., and PCI devices are named sequentially + starting after the ACPI devices. + + Prior to 2.6.10, there were confusing exceptions to this: + + - Firmware on some machines (mostly from HP) provides an HCDP + table[1] that tells the kernel about devices that can be used + as a serial console. If the user specified "console=ttyS0" + or the EFI ConOut path contained only UART devices, the + kernel registered the device described by the HCDP as + /dev/ttyS0. + + - If there was no HCDP, we assumed there were UARTs at the + legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so + the kernel registered those as /dev/ttyS0 and /dev/ttyS1. + + Any additional ACPI or PCI devices were registered sequentially + after /dev/ttyS0 as they were discovered. + + With an HCDP, device names changed depending on EFI configuration + and "console=" arguments. Without an HCDP, device names didn't + change, but we registered devices that might not really exist. + + For example, an HP rx1600 with a single built-in serial port + (described in the ACPI namespace) plus an MP[2] (a PCI device) has + these ports: + + ========== ========== ============ ============ ======= + Type MMIO pre-2.6.10 pre-2.6.10 2.6.10+ + address + (EFI console (EFI console + on builtin) on MP port) + ========== ========== ============ ============ ======= + builtin 0xff5e0000 ttyS0 ttyS1 ttyS0 + MP UPS 0xf8031000 ttyS1 ttyS2 ttyS1 + MP Console 0xf8030000 ttyS2 ttyS0 ttyS2 + MP 2 0xf8030010 ttyS3 ttyS3 ttyS3 + MP 3 0xf8030038 ttyS4 ttyS4 ttyS4 + ========== ========== ============ ============ ======= + +Console Selection +================= + + EFI knows what your console devices are, but it doesn't tell the + kernel quite enough to actually locate them. The DIG64 HCDP + table[1] does tell the kernel where potential serial console + devices are, but not all firmware supplies it. Also, EFI supports + multiple simultaneous consoles and doesn't tell the kernel which + should be the "primary" one. + + So how do you tell Linux which console device to use? + + - If your firmware supplies the HCDP, it is simplest to + configure EFI with a single device (either a UART or a VGA + card) as the console. Then you don't need to tell Linux + anything; the kernel will automatically use the EFI console. + + (This works only in 2.6.6 or later; prior to that you had + to specify "console=ttyS0" to get a serial console.) + + - Without an HCDP, Linux defaults to a VGA console unless you + specify a "console=" argument. + + NOTE: Don't assume that a serial console device will be /dev/ttyS0. + It might be ttyS1, ttyS2, etc. Make sure you have the appropriate + entries in /etc/inittab (for getty) and /etc/securetty (to allow + root login). + +Early Serial Console +==================== + + The kernel can't start using a serial console until it knows where + the device lives. Normally this happens when the driver enumerates + all the serial devices, which can happen a minute or more after the + kernel starts booting. + + 2.6.10 and later kernels have an "early uart" driver that works + very early in the boot process. The kernel will automatically use + this if the user supplies an argument like "console=uart,io,0x3f8", + or if the EFI console path contains only a UART device and the + firmware supplies an HCDP. + +Troubleshooting Serial Console Problems +======================================= + + No kernel output after elilo prints "Uncompressing Linux... done": + + - You specified "console=ttyS0" but Linux changed the device + to which ttyS0 refers. Configure exactly one EFI console + device[3] and remove the "console=" option. + + - The EFI console path contains both a VGA device and a UART. + EFI and elilo use both, but Linux defaults to VGA. Remove + the VGA device from the EFI console path[3]. + + - Multiple UARTs selected as EFI console devices. EFI and + elilo use all selected devices, but Linux uses only one. + Make sure only one UART is selected in the EFI console + path[3]. + + - You're connected to an HP MP port[2] but have a non-MP UART + selected as EFI console device. EFI uses the MP as a + console device even when it isn't explicitly selected. + Either move the console cable to the non-MP UART, or change + the EFI console path[3] to the MP UART. + + Long pause (60+ seconds) between "Uncompressing Linux... done" and + start of kernel output: + + - No early console because you used "console=ttyS". Remove + the "console=" option if your firmware supplies an HCDP. + + - If you don't have an HCDP, the kernel doesn't know where + your console lives until the driver discovers serial + devices. Use "console=uart,io,0x3f8" (or appropriate + address for your machine). + + Kernel and init script output works fine, but no "login:" prompt: + + - Add getty entry to /etc/inittab for console tty. Look for + the "Adding console on ttyS" message that tells you which + device is the console. + + "login:" prompt, but can't login as root: + + - Add entry to /etc/securetty for console tty. + + No ACPI serial devices found in 2.6.17 or later: + + - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI + serial devices were discovered by 8250_acpi. In 2.6.17, + 8250_acpi was replaced by the combination of 8250_pnp and + CONFIG_PNPACPI. + + + +[1] + http://www.dig64.org/specifications/agreement + The table was originally defined as the "HCDP" for "Headless + Console/Debug Port." The current version is the "PCDP" for + "Primary Console and Debug Port Devices." + +[2] + The HP MP (management processor) is a PCI device that provides + several UARTs. One of the UARTs is often used as a console; the + EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart". + The external connection is usually a 25-pin connector, and a + special dongle converts that to three 9-pin connectors, one of + which is labelled "Console." + +[3] + EFI console devices are configured using the EFI Boot Manager + "Boot option maintenance" menu. You may have to interrupt the + boot sequence to use this menu, and you will have to reset the + box after changing console configuration. diff --git a/Documentation/ia64/serial.txt b/Documentation/ia64/serial.txt deleted file mode 100644 index a63d2c54329b..000000000000 --- a/Documentation/ia64/serial.txt +++ /dev/null @@ -1,151 +0,0 @@ -SERIAL DEVICE NAMING - - As of 2.6.10, serial devices on ia64 are named based on the - order of ACPI and PCI enumeration. The first device in the - ACPI namespace (if any) becomes /dev/ttyS0, the second becomes - /dev/ttyS1, etc., and PCI devices are named sequentially - starting after the ACPI devices. - - Prior to 2.6.10, there were confusing exceptions to this: - - - Firmware on some machines (mostly from HP) provides an HCDP - table[1] that tells the kernel about devices that can be used - as a serial console. If the user specified "console=ttyS0" - or the EFI ConOut path contained only UART devices, the - kernel registered the device described by the HCDP as - /dev/ttyS0. - - - If there was no HCDP, we assumed there were UARTs at the - legacy COM port addresses (I/O ports 0x3f8 and 0x2f8), so - the kernel registered those as /dev/ttyS0 and /dev/ttyS1. - - Any additional ACPI or PCI devices were registered sequentially - after /dev/ttyS0 as they were discovered. - - With an HCDP, device names changed depending on EFI configuration - and "console=" arguments. Without an HCDP, device names didn't - change, but we registered devices that might not really exist. - - For example, an HP rx1600 with a single built-in serial port - (described in the ACPI namespace) plus an MP[2] (a PCI device) has - these ports: - - pre-2.6.10 pre-2.6.10 - MMIO (EFI console (EFI console - address on builtin) on MP port) 2.6.10 - ========== ========== ========== ====== - builtin 0xff5e0000 ttyS0 ttyS1 ttyS0 - MP UPS 0xf8031000 ttyS1 ttyS2 ttyS1 - MP Console 0xf8030000 ttyS2 ttyS0 ttyS2 - MP 2 0xf8030010 ttyS3 ttyS3 ttyS3 - MP 3 0xf8030038 ttyS4 ttyS4 ttyS4 - -CONSOLE SELECTION - - EFI knows what your console devices are, but it doesn't tell the - kernel quite enough to actually locate them. The DIG64 HCDP - table[1] does tell the kernel where potential serial console - devices are, but not all firmware supplies it. Also, EFI supports - multiple simultaneous consoles and doesn't tell the kernel which - should be the "primary" one. - - So how do you tell Linux which console device to use? - - - If your firmware supplies the HCDP, it is simplest to - configure EFI with a single device (either a UART or a VGA - card) as the console. Then you don't need to tell Linux - anything; the kernel will automatically use the EFI console. - - (This works only in 2.6.6 or later; prior to that you had - to specify "console=ttyS0" to get a serial console.) - - - Without an HCDP, Linux defaults to a VGA console unless you - specify a "console=" argument. - - NOTE: Don't assume that a serial console device will be /dev/ttyS0. - It might be ttyS1, ttyS2, etc. Make sure you have the appropriate - entries in /etc/inittab (for getty) and /etc/securetty (to allow - root login). - -EARLY SERIAL CONSOLE - - The kernel can't start using a serial console until it knows where - the device lives. Normally this happens when the driver enumerates - all the serial devices, which can happen a minute or more after the - kernel starts booting. - - 2.6.10 and later kernels have an "early uart" driver that works - very early in the boot process. The kernel will automatically use - this if the user supplies an argument like "console=uart,io,0x3f8", - or if the EFI console path contains only a UART device and the - firmware supplies an HCDP. - -TROUBLESHOOTING SERIAL CONSOLE PROBLEMS - - No kernel output after elilo prints "Uncompressing Linux... done": - - - You specified "console=ttyS0" but Linux changed the device - to which ttyS0 refers. Configure exactly one EFI console - device[3] and remove the "console=" option. - - - The EFI console path contains both a VGA device and a UART. - EFI and elilo use both, but Linux defaults to VGA. Remove - the VGA device from the EFI console path[3]. - - - Multiple UARTs selected as EFI console devices. EFI and - elilo use all selected devices, but Linux uses only one. - Make sure only one UART is selected in the EFI console - path[3]. - - - You're connected to an HP MP port[2] but have a non-MP UART - selected as EFI console device. EFI uses the MP as a - console device even when it isn't explicitly selected. - Either move the console cable to the non-MP UART, or change - the EFI console path[3] to the MP UART. - - Long pause (60+ seconds) between "Uncompressing Linux... done" and - start of kernel output: - - - No early console because you used "console=ttyS". Remove - the "console=" option if your firmware supplies an HCDP. - - - If you don't have an HCDP, the kernel doesn't know where - your console lives until the driver discovers serial - devices. Use "console=uart,io,0x3f8" (or appropriate - address for your machine). - - Kernel and init script output works fine, but no "login:" prompt: - - - Add getty entry to /etc/inittab for console tty. Look for - the "Adding console on ttyS" message that tells you which - device is the console. - - "login:" prompt, but can't login as root: - - - Add entry to /etc/securetty for console tty. - - No ACPI serial devices found in 2.6.17 or later: - - - Turn on CONFIG_PNP and CONFIG_PNPACPI. Prior to 2.6.17, ACPI - serial devices were discovered by 8250_acpi. In 2.6.17, - 8250_acpi was replaced by the combination of 8250_pnp and - CONFIG_PNPACPI. - - - -[1] http://www.dig64.org/specifications/agreement - The table was originally defined as the "HCDP" for "Headless - Console/Debug Port." The current version is the "PCDP" for - "Primary Console and Debug Port Devices." - -[2] The HP MP (management processor) is a PCI device that provides - several UARTs. One of the UARTs is often used as a console; the - EFI Boot Manager identifies it as "Acpi(HWP0002,700)/Pci(...)/Uart". - The external connection is usually a 25-pin connector, and a - special dongle converts that to three 9-pin connectors, one of - which is labelled "Console." - -[3] EFI console devices are configured using the EFI Boot Manager - "Boot option maintenance" menu. You may have to interrupt the - boot sequence to use this menu, and you will have to reset the - box after changing console configuration. diff --git a/Documentation/ia64/xen.rst b/Documentation/ia64/xen.rst new file mode 100644 index 000000000000..831339c74441 --- /dev/null +++ b/Documentation/ia64/xen.rst @@ -0,0 +1,206 @@ +******************************************************** +Recipe for getting/building/running Xen/ia64 with pv_ops +******************************************************** +This recipe describes how to get xen-ia64 source and build it, +and run domU with pv_ops. + +Requirements +============ + + - python + - mercurial + it (aka "hg") is an open-source source code + management software. See the below. + http://www.selenic.com/mercurial/wiki/ + - git + - bridge-utils + +Getting and Building Xen and Dom0 +================================= + + My environment is: + + - Machine : Tiger4 + - Domain0 OS : RHEL5 + - DomainU OS : RHEL5 + + 1. Download source:: + + # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg + # cd xen-unstable.hg + # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg + + 2. # make world + + 3. # make install-tools + + 4. copy kernels and xen:: + + # cp xen/xen.gz /boot/efi/efi/redhat/ + # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \ + /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen + + 5. make initrd for Dom0/DomU:: + + # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \ + O=$(pwd)/build-linux-2.6.18-xen_ia64 + # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \ + 2.6.18.8-xen --builtin mptspi --builtin mptbase \ + --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \ + --builtin ehci-hcd + +Making a disk image for guest OS +================================ + + 1. make file:: + + # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0 + # mke2fs -F -j /root/rhel5.img + # mount -o loop /root/rhel5.img /mnt + # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt + # mkdir /mnt/{root,proc,sys,home,tmp} + + Note: You may miss some device files. If so, please create them + with mknod. Or you can use tar instead of cp. + + 2. modify DomU's fstab:: + + # vi /mnt/etc/fstab + /dev/xvda1 / ext3 defaults 1 1 + none /dev/pts devpts gid=5,mode=620 0 0 + none /dev/shm tmpfs defaults 0 0 + none /proc proc defaults 0 0 + none /sys sysfs defaults 0 0 + + 3. modify inittab + + set runlevel to 3 to avoid X trying to start:: + + # vi /mnt/etc/inittab + id:3:initdefault: + + Start a getty on the hvc0 console:: + + X0:2345:respawn:/sbin/mingetty hvc0 + + tty1-6 mingetty can be commented out + + 4. add hvc0 into /etc/securetty:: + + # vi /mnt/etc/securetty (add hvc0) + + 5. umount:: + + # umount /mnt + +FYI, virt-manager can also make a disk image for guest OS. +It's GUI tools and easy to make it. + +Boot Xen & Domain0 +================== + + 1. replace elilo + elilo of RHEL5 can boot Xen and Dom0. + If you use old elilo (e.g RHEL4), please download from the below + http://elilo.sourceforge.net/cgi-bin/blosxom + and copy into /boot/efi/efi/redhat/:: + + # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi + + 2. modify elilo.conf (like the below):: + + # vi /boot/efi/efi/redhat/elilo.conf + prompt + timeout=20 + default=xen + relocatable + + image=vmlinuz-2.6.18.8-xen + label=xen + vmm=xen.gz + initrd=initrd-2.6.18.8-xen.img + read-only + append=" -- rhgb root=/dev/sda2" + +The append options before "--" are for xen hypervisor, +the options after "--" are for dom0. + +FYI, your machine may need console options like +"com1=19200,8n1 console=vga,com1". For example, +append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \ +console=ttyS0 root=/dev/sda2" + +Getting and Building domU with pv_ops +===================================== + + 1. get pv_ops tree:: + + # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/ + + 2. git branch (if necessary):: + + # cd linux-2.6-xen-ia64/ + # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19 + + Note: + The current branch is xen-ia64-domu-minimal-2008may19. + But you would find the new branch. You can see with + "git branch -r" to get the branch lists. + + http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/ + + is also available. + + The tree is based on + + git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test) + + 3. copy .config for pv_ops of domU:: + + # cp arch/ia64/configs/xen_domu_wip_defconfig .config + + 4. make kernel with pv_ops:: + + # make oldconfig + # make + + 5. install the kernel and initrd:: + + # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU + # make modules_install + # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \ + 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \ + --builtin mptbase --builtin mptscsih --builtin uhci-hcd \ + --builtin ohci-hcd --builtin ehci-hcd + +Boot DomainU with pv_ops +======================== + + 1. make config of DomU:: + + # vi /etc/xen/rhel5 + kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU" + ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img" + vcpus = 1 + memory = 512 + name = "rhel5" + disk = [ 'file:/root/rhel5.img,xvda1,w' ] + root = "/dev/xvda1 ro" + extra= "rhgb console=hvc0" + + 2. After boot xen and dom0, start xend:: + + # /etc/init.d/xend start + + ( In the debugging case, `# XEND_DEBUG=1 xend trace_start` ) + + 3. start domU:: + + # xm create -c rhel5 + +Reference +========= +- Wiki of Xen/IA64 upstream merge + http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge + +Written by Akio Takebe on 28 May 2008 diff --git a/Documentation/ia64/xen.txt b/Documentation/ia64/xen.txt deleted file mode 100644 index a12c74ce2773..000000000000 --- a/Documentation/ia64/xen.txt +++ /dev/null @@ -1,183 +0,0 @@ - Recipe for getting/building/running Xen/ia64 with pv_ops - -------------------------------------------------------- - -This recipe describes how to get xen-ia64 source and build it, -and run domU with pv_ops. - -============ -Requirements -============ - - - python - - mercurial - it (aka "hg") is an open-source source code - management software. See the below. - http://www.selenic.com/mercurial/wiki/ - - git - - bridge-utils - -================================= -Getting and Building Xen and Dom0 -================================= - - My environment is; - Machine : Tiger4 - Domain0 OS : RHEL5 - DomainU OS : RHEL5 - - 1. Download source - # hg clone http://xenbits.xensource.com/ext/ia64/xen-unstable.hg - # cd xen-unstable.hg - # hg clone http://xenbits.xensource.com/ext/ia64/linux-2.6.18-xen.hg - - 2. # make world - - 3. # make install-tools - - 4. copy kernels and xen - # cp xen/xen.gz /boot/efi/efi/redhat/ - # cp build-linux-2.6.18-xen_ia64/vmlinux.gz \ - /boot/efi/efi/redhat/vmlinuz-2.6.18.8-xen - - 5. make initrd for Dom0/DomU - # make -C linux-2.6.18-xen.hg ARCH=ia64 modules_install \ - O=$(pwd)/build-linux-2.6.18-xen_ia64 - # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6.18.8-xen.img \ - 2.6.18.8-xen --builtin mptspi --builtin mptbase \ - --builtin mptscsih --builtin uhci-hcd --builtin ohci-hcd \ - --builtin ehci-hcd - -================================ -Making a disk image for guest OS -================================ - - 1. make file - # dd if=/dev/zero of=/root/rhel5.img bs=1M seek=4096 count=0 - # mke2fs -F -j /root/rhel5.img - # mount -o loop /root/rhel5.img /mnt - # cp -ax /{dev,var,etc,usr,bin,sbin,lib} /mnt - # mkdir /mnt/{root,proc,sys,home,tmp} - - Note: You may miss some device files. If so, please create them - with mknod. Or you can use tar instead of cp. - - 2. modify DomU's fstab - # vi /mnt/etc/fstab - /dev/xvda1 / ext3 defaults 1 1 - none /dev/pts devpts gid=5,mode=620 0 0 - none /dev/shm tmpfs defaults 0 0 - none /proc proc defaults 0 0 - none /sys sysfs defaults 0 0 - - 3. modify inittab - set runlevel to 3 to avoid X trying to start - # vi /mnt/etc/inittab - id:3:initdefault: - Start a getty on the hvc0 console - X0:2345:respawn:/sbin/mingetty hvc0 - tty1-6 mingetty can be commented out - - 4. add hvc0 into /etc/securetty - # vi /mnt/etc/securetty (add hvc0) - - 5. umount - # umount /mnt - -FYI, virt-manager can also make a disk image for guest OS. -It's GUI tools and easy to make it. - -================== -Boot Xen & Domain0 -================== - - 1. replace elilo - elilo of RHEL5 can boot Xen and Dom0. - If you use old elilo (e.g RHEL4), please download from the below - http://elilo.sourceforge.net/cgi-bin/blosxom - and copy into /boot/efi/efi/redhat/ - # cp elilo-3.6-ia64.efi /boot/efi/efi/redhat/elilo.efi - - 2. modify elilo.conf (like the below) - # vi /boot/efi/efi/redhat/elilo.conf - prompt - timeout=20 - default=xen - relocatable - - image=vmlinuz-2.6.18.8-xen - label=xen - vmm=xen.gz - initrd=initrd-2.6.18.8-xen.img - read-only - append=" -- rhgb root=/dev/sda2" - -The append options before "--" are for xen hypervisor, -the options after "--" are for dom0. - -FYI, your machine may need console options like -"com1=19200,8n1 console=vga,com1". For example, -append="com1=19200,8n1 console=vga,com1 -- rhgb console=tty0 \ -console=ttyS0 root=/dev/sda2" - -===================================== -Getting and Building domU with pv_ops -===================================== - - 1. get pv_ops tree - # git clone http://people.valinux.co.jp/~yamahata/xen-ia64/linux-2.6-xen-ia64.git/ - - 2. git branch (if necessary) - # cd linux-2.6-xen-ia64/ - # git checkout -b your_branch origin/xen-ia64-domu-minimal-2008may19 - (Note: The current branch is xen-ia64-domu-minimal-2008may19. - But you would find the new branch. You can see with - "git branch -r" to get the branch lists. - http://people.valinux.co.jp/~yamahata/xen-ia64/for_eagl/linux-2.6-ia64-pv-ops.git/ - is also available. The tree is based on - git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6 test) - - - 3. copy .config for pv_ops of domU - # cp arch/ia64/configs/xen_domu_wip_defconfig .config - - 4. make kernel with pv_ops - # make oldconfig - # make - - 5. install the kernel and initrd - # cp vmlinux.gz /boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU - # make modules_install - # mkinitrd -f /boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img \ - 2.6.26-rc3xen-ia64-08941-g1b12161 --builtin mptspi \ - --builtin mptbase --builtin mptscsih --builtin uhci-hcd \ - --builtin ohci-hcd --builtin ehci-hcd - -======================== -Boot DomainU with pv_ops -======================== - - 1. make config of DomU - # vi /etc/xen/rhel5 - kernel = "/boot/efi/efi/redhat/vmlinuz-2.6-pv_ops-xenU" - ramdisk = "/boot/efi/efi/redhat/initrd-2.6-pv_ops-xenU.img" - vcpus = 1 - memory = 512 - name = "rhel5" - disk = [ 'file:/root/rhel5.img,xvda1,w' ] - root = "/dev/xvda1 ro" - extra= "rhgb console=hvc0" - - 2. After boot xen and dom0, start xend - # /etc/init.d/xend start - ( In the debugging case, # XEND_DEBUG=1 xend trace_start ) - - 3. start domU - # xm create -c rhel5 - -========= -Reference -========= -- Wiki of Xen/IA64 upstream merge - http://wiki.xensource.com/xenwiki/XenIA64/UpstreamMerge - -Written by Akio Takebe on 28 May 2008 diff --git a/MAINTAINERS b/MAINTAINERS index 2a2d74e5d670..c30b52c9049a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14389,7 +14389,7 @@ SGI SN-IA64 (Altix) SERIAL CONSOLE DRIVER M: Pat Gefre L: linux-ia64@vger.kernel.org S: Supported -F: Documentation/ia64/serial.txt +F: Documentation/ia64/serial.rst F: drivers/tty/serial/ioc?_serial.c F: include/linux/ioc?.h diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 8f106638913c..3795d18276c4 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -852,7 +852,7 @@ valid_phys_addr_range (phys_addr_t phys_addr, unsigned long size) * /dev/mem reads and writes use copy_to_user(), which implicitly * uses a granule-sized kernel identity mapping. It's really * only safe to do this for regions in kern_memmap. For more - * details, see Documentation/ia64/aliasing.txt. + * details, see Documentation/ia64/aliasing.rst. */ attr = kern_mem_attribute(phys_addr, size); if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index d80c99a5f55d..0750a716adc7 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -28,7 +28,7 @@ #include /* - * See Documentation/ia64/fsys.txt for details on fsyscalls. + * See Documentation/ia64/fsys.rst for details on fsyscalls. * * On entry to an fsyscall handler: * r10 = 0 (i.e., defaults to "successful syscall return") diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 5e3e7b1fdac5..0c0de2c4ec69 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -42,7 +42,7 @@ ioremap (unsigned long phys_addr, unsigned long size) /* * For things in kern_memmap, we must use the same attribute * as the rest of the kernel. For more details, see - * Documentation/ia64/aliasing.txt. + * Documentation/ia64/aliasing.rst. */ attr = kern_mem_attribute(phys_addr, size); if (attr & EFI_MEMORY_WB) diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index e308196c2229..165e561dc81a 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -450,7 +450,7 @@ pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma, return -ENOSYS; /* - * Avoid attribute aliasing. See Documentation/ia64/aliasing.txt + * Avoid attribute aliasing. See Documentation/ia64/aliasing.rst * for more details. */ if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size)) -- cgit v1.2.3-70-g09d2 From 8ea0afa3b801e9fe3ff676c3e60e74afa1a0848a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 18 Apr 2019 14:34:34 -0300 Subject: docs: xtensa: convert to ReST Rename the xtensa documentation files to ReST, add an index for them and adjust in order to produce a nice html output via the Sphinx build system. At its new index.rst, let's add a :orphan: while this is not linked to the main index.rst file, in order to avoid build warnings. Signed-off-by: Mauro Carvalho Chehab --- Documentation/xtensa/atomctl.rst | 51 ++++++++ Documentation/xtensa/atomctl.txt | 44 ------- Documentation/xtensa/booting.rst | 22 ++++ Documentation/xtensa/booting.txt | 19 --- Documentation/xtensa/index.rst | 12 ++ Documentation/xtensa/mmu.rst | 195 +++++++++++++++++++++++++++++++ Documentation/xtensa/mmu.txt | 189 ------------------------------ arch/xtensa/include/asm/initialize_mmu.h | 2 +- 8 files changed, 281 insertions(+), 253 deletions(-) create mode 100644 Documentation/xtensa/atomctl.rst delete mode 100644 Documentation/xtensa/atomctl.txt create mode 100644 Documentation/xtensa/booting.rst delete mode 100644 Documentation/xtensa/booting.txt create mode 100644 Documentation/xtensa/index.rst create mode 100644 Documentation/xtensa/mmu.rst delete mode 100644 Documentation/xtensa/mmu.txt (limited to 'arch') diff --git a/Documentation/xtensa/atomctl.rst b/Documentation/xtensa/atomctl.rst new file mode 100644 index 000000000000..1ecbd0ba9a2e --- /dev/null +++ b/Documentation/xtensa/atomctl.rst @@ -0,0 +1,51 @@ +=========================================== +Atomic Operation Control (ATOMCTL) Register +=========================================== + +We Have Atomic Operation Control (ATOMCTL) Register. +This register determines the effect of using a S32C1I instruction +with various combinations of: + + 1. With and without an Coherent Cache Controller which + can do Atomic Transactions to the memory internally. + + 2. With and without An Intelligent Memory Controller which + can do Atomic Transactions itself. + +The Core comes up with a default value of for the three types of cache ops:: + + 0x28: (WB: Internal, WT: Internal, BY:Exception) + +On the FPGA Cards we typically simulate an Intelligent Memory controller +which can implement RCW transactions. For FPGA cards with an External +Memory controller we let it to the atomic operations internally while +doing a Cached (WB) transaction and use the Memory RCW for un-cached +operations. + +For systems without an coherent cache controller, non-MX, we always +use the memory controllers RCW, thought non-MX controlers likely +support the Internal Operation. + +CUSTOMER-WARNING: + Virtually all customers buy their memory controllers from vendors that + don't support atomic RCW memory transactions and will likely want to + configure this register to not use RCW. + +Developers might find using RCW in Bypass mode convenient when testing +with the cache being bypassed; for example studying cache alias problems. + +See Section 4.3.12.4 of ISA; Bits:: + + WB WT BY + 5 4 | 3 2 | 1 0 + +========= ================== ================== =============== + 2 Bit + Field + Values WB - Write Back WT - Write Thru BY - Bypass +========= ================== ================== =============== + 0 Exception Exception Exception + 1 RCW Transaction RCW Transaction RCW Transaction + 2 Internal Operation Internal Operation Reserved + 3 Reserved Reserved Reserved +========= ================== ================== =============== diff --git a/Documentation/xtensa/atomctl.txt b/Documentation/xtensa/atomctl.txt deleted file mode 100644 index 1da783ac200c..000000000000 --- a/Documentation/xtensa/atomctl.txt +++ /dev/null @@ -1,44 +0,0 @@ -We Have Atomic Operation Control (ATOMCTL) Register. -This register determines the effect of using a S32C1I instruction -with various combinations of: - - 1. With and without an Coherent Cache Controller which - can do Atomic Transactions to the memory internally. - - 2. With and without An Intelligent Memory Controller which - can do Atomic Transactions itself. - -The Core comes up with a default value of for the three types of cache ops: - - 0x28: (WB: Internal, WT: Internal, BY:Exception) - -On the FPGA Cards we typically simulate an Intelligent Memory controller -which can implement RCW transactions. For FPGA cards with an External -Memory controller we let it to the atomic operations internally while -doing a Cached (WB) transaction and use the Memory RCW for un-cached -operations. - -For systems without an coherent cache controller, non-MX, we always -use the memory controllers RCW, thought non-MX controlers likely -support the Internal Operation. - -CUSTOMER-WARNING: - Virtually all customers buy their memory controllers from vendors that - don't support atomic RCW memory transactions and will likely want to - configure this register to not use RCW. - -Developers might find using RCW in Bypass mode convenient when testing -with the cache being bypassed; for example studying cache alias problems. - -See Section 4.3.12.4 of ISA; Bits: - - WB WT BY - 5 4 | 3 2 | 1 0 - 2 Bit - Field - Values WB - Write Back WT - Write Thru BY - Bypass ---------- --------------- ----------------- ---------------- - 0 Exception Exception Exception - 1 RCW Transaction RCW Transaction RCW Transaction - 2 Internal Operation Internal Operation Reserved - 3 Reserved Reserved Reserved diff --git a/Documentation/xtensa/booting.rst b/Documentation/xtensa/booting.rst new file mode 100644 index 000000000000..e1b83707e5b6 --- /dev/null +++ b/Documentation/xtensa/booting.rst @@ -0,0 +1,22 @@ +===================================== +Passing boot parameters to the kernel +===================================== + +Boot parameters are represented as a TLV list in the memory. Please see +arch/xtensa/include/asm/bootparam.h for definition of the bp_tag structure and +tag value constants. First entry in the list must have type BP_TAG_FIRST, last +entry must have type BP_TAG_LAST. The address of the first list entry is +passed to the kernel in the register a2. The address type depends on MMU type: + +- For configurations without MMU, with region protection or with MPU the + address must be the physical address. +- For configurations with region translarion MMU or with MMUv3 and CONFIG_MMU=n + the address must be a valid address in the current mapping. The kernel will + not change the mapping on its own. +- For configurations with MMUv2 the address must be a virtual address in the + default virtual mapping (0xd0000000..0xffffffff). +- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a + virtual or physical address. In either case it must be within the default + virtual mapping. It is considered physical if it is within the range of + physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR.. + XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual. diff --git a/Documentation/xtensa/booting.txt b/Documentation/xtensa/booting.txt deleted file mode 100644 index 402b33a2619f..000000000000 --- a/Documentation/xtensa/booting.txt +++ /dev/null @@ -1,19 +0,0 @@ -Passing boot parameters to the kernel. - -Boot parameters are represented as a TLV list in the memory. Please see -arch/xtensa/include/asm/bootparam.h for definition of the bp_tag structure and -tag value constants. First entry in the list must have type BP_TAG_FIRST, last -entry must have type BP_TAG_LAST. The address of the first list entry is -passed to the kernel in the register a2. The address type depends on MMU type: -- For configurations without MMU, with region protection or with MPU the - address must be the physical address. -- For configurations with region translarion MMU or with MMUv3 and CONFIG_MMU=n - the address must be a valid address in the current mapping. The kernel will - not change the mapping on its own. -- For configurations with MMUv2 the address must be a virtual address in the - default virtual mapping (0xd0000000..0xffffffff). -- For configurations with MMUv3 and CONFIG_MMU=y the address may be either a - virtual or physical address. In either case it must be within the default - virtual mapping. It is considered physical if it is within the range of - physical addresses covered by the default KSEG mapping (XCHAL_KSEG_PADDR.. - XCHAL_KSEG_PADDR + XCHAL_KSEG_SIZE), otherwise it is considered virtual. diff --git a/Documentation/xtensa/index.rst b/Documentation/xtensa/index.rst new file mode 100644 index 000000000000..5a24e365e35f --- /dev/null +++ b/Documentation/xtensa/index.rst @@ -0,0 +1,12 @@ +:orphan: + +=================== +Xtensa Architecture +=================== + +.. toctree:: + :maxdepth: 1 + + atomctl + booting + mmu diff --git a/Documentation/xtensa/mmu.rst b/Documentation/xtensa/mmu.rst new file mode 100644 index 000000000000..e52a12960fdc --- /dev/null +++ b/Documentation/xtensa/mmu.rst @@ -0,0 +1,195 @@ +============================= +MMUv3 initialization sequence +============================= + +The code in the initialize_mmu macro sets up MMUv3 memory mapping +identically to MMUv2 fixed memory mapping. Depending on +CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX symbol this code is +located in addresses it was linked for (symbol undefined), or not +(symbol defined), so it needs to be position-independent. + +The code has the following assumptions: + + - This code fragment is run only on an MMU v3. + - TLBs are in their reset state. + - ITLBCFG and DTLBCFG are zero (reset state). + - RASID is 0x04030201 (reset state). + - PS.RING is zero (reset state). + - LITBASE is zero (reset state, PC-relative literals); required to be PIC. + +TLB setup proceeds along the following steps. + + Legend: + + - VA = virtual address (two upper nibbles of it); + - PA = physical address (two upper nibbles of it); + - pc = physical range that contains this code; + +After step 2, we jump to virtual address in the range 0x40000000..0x5fffffff +or 0x00000000..0x1fffffff, depending on whether the kernel was loaded below +0x40000000 or above. That address corresponds to next instruction to execute +in this code. After step 4, we jump to intended (linked) address of this code. +The scheme below assumes that the kernel is loaded below 0x40000000. + + ====== ===== ===== ===== ===== ====== ===== ===== + - Step0 Step1 Step2 Step3 Step4 Step5 + + VA PA PA PA PA VA PA PA + ====== ===== ===== ===== ===== ====== ===== ===== + E0..FF -> E0 -> E0 -> E0 F0..FF -> F0 -> F0 + C0..DF -> C0 -> C0 -> C0 E0..EF -> F0 -> F0 + A0..BF -> A0 -> A0 -> A0 D8..DF -> 00 -> 00 + 80..9F -> 80 -> 80 -> 80 D0..D7 -> 00 -> 00 + 60..7F -> 60 -> 60 -> 60 + 40..5F -> 40 -> pc -> pc 40..5F -> pc + 20..3F -> 20 -> 20 -> 20 + 00..1F -> 00 -> 00 -> 00 + ====== ===== ===== ===== ===== ====== ===== ===== + +The default location of IO peripherals is above 0xf0000000. This may be changed +using a "ranges" property in a device tree simple-bus node. See the Devicetree +Specification, section 4.5 for details on the syntax and semantics of +simple-bus nodes. The following limitations apply: + +1. Only top level simple-bus nodes are considered + +2. Only one (first) simple-bus node is considered + +3. Empty "ranges" properties are not supported + +4. Only the first triplet in the "ranges" property is considered + +5. The parent-bus-address value is rounded down to the nearest 256MB boundary + +6. The IO area covers the entire 256MB segment of parent-bus-address; the + "ranges" triplet length field is ignored + + +MMUv3 address space layouts. +============================ + +Default MMUv2-compatible layout:: + + Symbol VADDR Size + +------------------+ + | Userspace | 0x00000000 TASK_SIZE + +------------------+ 0x40000000 + +------------------+ + | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE + +------------------+ + | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE + +------------------+ 0x8e400000 + +------------------+ + | VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB + +------------------+ VMALLOC_END + | Cache aliasing | TLBTEMP_BASE_1 0xc7ff0000 DCACHE_WAY_SIZE + | remap area 1 | + +------------------+ + | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE + | remap area 2 | + +------------------+ + +------------------+ + | KMAP area | PKMAP_BASE PTRS_PER_PTE * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + | | (4MB * DCACHE_N_COLORS) + +------------------+ + | Atomic KMAP area | FIXADDR_START KM_TYPE_NR * + | | NR_CPUS * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + +------------------+ FIXADDR_TOP 0xcffff000 + +------------------+ + | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xd0000000 128MB + +------------------+ + | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xd8000000 128MB + +------------------+ + | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB + +------------------+ + | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB + +------------------+ + + +256MB cached + 256MB uncached layout:: + + Symbol VADDR Size + +------------------+ + | Userspace | 0x00000000 TASK_SIZE + +------------------+ 0x40000000 + +------------------+ + | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE + +------------------+ + | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE + +------------------+ 0x8e400000 + +------------------+ + | VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB + +------------------+ VMALLOC_END + | Cache aliasing | TLBTEMP_BASE_1 0xa7ff0000 DCACHE_WAY_SIZE + | remap area 1 | + +------------------+ + | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE + | remap area 2 | + +------------------+ + +------------------+ + | KMAP area | PKMAP_BASE PTRS_PER_PTE * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + | | (4MB * DCACHE_N_COLORS) + +------------------+ + | Atomic KMAP area | FIXADDR_START KM_TYPE_NR * + | | NR_CPUS * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + +------------------+ FIXADDR_TOP 0xaffff000 + +------------------+ + | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xb0000000 256MB + +------------------+ + | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 256MB + +------------------+ + +------------------+ + | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB + +------------------+ + | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB + +------------------+ + + +512MB cached + 512MB uncached layout:: + + Symbol VADDR Size + +------------------+ + | Userspace | 0x00000000 TASK_SIZE + +------------------+ 0x40000000 + +------------------+ + | Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE + +------------------+ + | KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE + +------------------+ 0x8e400000 + +------------------+ + | VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB + +------------------+ VMALLOC_END + | Cache aliasing | TLBTEMP_BASE_1 0x97ff0000 DCACHE_WAY_SIZE + | remap area 1 | + +------------------+ + | Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE + | remap area 2 | + +------------------+ + +------------------+ + | KMAP area | PKMAP_BASE PTRS_PER_PTE * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + | | (4MB * DCACHE_N_COLORS) + +------------------+ + | Atomic KMAP area | FIXADDR_START KM_TYPE_NR * + | | NR_CPUS * + | | DCACHE_N_COLORS * + | | PAGE_SIZE + +------------------+ FIXADDR_TOP 0x9ffff000 + +------------------+ + | Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xa0000000 512MB + +------------------+ + | Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 512MB + +------------------+ + | Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB + +------------------+ + | Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB + +------------------+ diff --git a/Documentation/xtensa/mmu.txt b/Documentation/xtensa/mmu.txt deleted file mode 100644 index 318114de63f3..000000000000 --- a/Documentation/xtensa/mmu.txt +++ /dev/null @@ -1,189 +0,0 @@ -MMUv3 initialization sequence. - -The code in the initialize_mmu macro sets up MMUv3 memory mapping -identically to MMUv2 fixed memory mapping. Depending on -CONFIG_INITIALIZE_XTENSA_MMU_INSIDE_VMLINUX symbol this code is -located in addresses it was linked for (symbol undefined), or not -(symbol defined), so it needs to be position-independent. - -The code has the following assumptions: - This code fragment is run only on an MMU v3. - TLBs are in their reset state. - ITLBCFG and DTLBCFG are zero (reset state). - RASID is 0x04030201 (reset state). - PS.RING is zero (reset state). - LITBASE is zero (reset state, PC-relative literals); required to be PIC. - -TLB setup proceeds along the following steps. - - Legend: - VA = virtual address (two upper nibbles of it); - PA = physical address (two upper nibbles of it); - pc = physical range that contains this code; - -After step 2, we jump to virtual address in the range 0x40000000..0x5fffffff -or 0x00000000..0x1fffffff, depending on whether the kernel was loaded below -0x40000000 or above. That address corresponds to next instruction to execute -in this code. After step 4, we jump to intended (linked) address of this code. -The scheme below assumes that the kernel is loaded below 0x40000000. - - Step0 Step1 Step2 Step3 Step4 Step5 - ===== ===== ===== ===== ===== ===== - VA PA PA PA PA VA PA PA - ------ -- -- -- -- ------ -- -- - E0..FF -> E0 -> E0 -> E0 F0..FF -> F0 -> F0 - C0..DF -> C0 -> C0 -> C0 E0..EF -> F0 -> F0 - A0..BF -> A0 -> A0 -> A0 D8..DF -> 00 -> 00 - 80..9F -> 80 -> 80 -> 80 D0..D7 -> 00 -> 00 - 60..7F -> 60 -> 60 -> 60 - 40..5F -> 40 -> pc -> pc 40..5F -> pc - 20..3F -> 20 -> 20 -> 20 - 00..1F -> 00 -> 00 -> 00 - -The default location of IO peripherals is above 0xf0000000. This may be changed -using a "ranges" property in a device tree simple-bus node. See the Devicetree -Specification, section 4.5 for details on the syntax and semantics of -simple-bus nodes. The following limitations apply: - -1. Only top level simple-bus nodes are considered - -2. Only one (first) simple-bus node is considered - -3. Empty "ranges" properties are not supported - -4. Only the first triplet in the "ranges" property is considered - -5. The parent-bus-address value is rounded down to the nearest 256MB boundary - -6. The IO area covers the entire 256MB segment of parent-bus-address; the - "ranges" triplet length field is ignored - - -MMUv3 address space layouts. -============================ - -Default MMUv2-compatible layout. - - Symbol VADDR Size -+------------------+ -| Userspace | 0x00000000 TASK_SIZE -+------------------+ 0x40000000 -+------------------+ -| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE -+------------------+ -| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE -+------------------+ 0x8e400000 -+------------------+ -| VMALLOC area | VMALLOC_START 0xc0000000 128MB - 64KB -+------------------+ VMALLOC_END -| Cache aliasing | TLBTEMP_BASE_1 0xc7ff0000 DCACHE_WAY_SIZE -| remap area 1 | -+------------------+ -| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE -| remap area 2 | -+------------------+ -+------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0xcffff000 -+------------------+ -| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xd0000000 128MB -+------------------+ -| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xd8000000 128MB -+------------------+ -| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB -+------------------+ -| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB -+------------------+ - - -256MB cached + 256MB uncached layout. - - Symbol VADDR Size -+------------------+ -| Userspace | 0x00000000 TASK_SIZE -+------------------+ 0x40000000 -+------------------+ -| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE -+------------------+ -| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE -+------------------+ 0x8e400000 -+------------------+ -| VMALLOC area | VMALLOC_START 0xa0000000 128MB - 64KB -+------------------+ VMALLOC_END -| Cache aliasing | TLBTEMP_BASE_1 0xa7ff0000 DCACHE_WAY_SIZE -| remap area 1 | -+------------------+ -| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE -| remap area 2 | -+------------------+ -+------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0xaffff000 -+------------------+ -| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xb0000000 256MB -+------------------+ -| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 256MB -+------------------+ -+------------------+ -| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB -+------------------+ -| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB -+------------------+ - - -512MB cached + 512MB uncached layout. - - Symbol VADDR Size -+------------------+ -| Userspace | 0x00000000 TASK_SIZE -+------------------+ 0x40000000 -+------------------+ -| Page table | XCHAL_PAGE_TABLE_VADDR 0x80000000 XCHAL_PAGE_TABLE_SIZE -+------------------+ -| KASAN shadow map | KASAN_SHADOW_START 0x80400000 KASAN_SHADOW_SIZE -+------------------+ 0x8e400000 -+------------------+ -| VMALLOC area | VMALLOC_START 0x90000000 128MB - 64KB -+------------------+ VMALLOC_END -| Cache aliasing | TLBTEMP_BASE_1 0x97ff0000 DCACHE_WAY_SIZE -| remap area 1 | -+------------------+ -| Cache aliasing | TLBTEMP_BASE_2 DCACHE_WAY_SIZE -| remap area 2 | -+------------------+ -+------------------+ -| KMAP area | PKMAP_BASE PTRS_PER_PTE * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -| | (4MB * DCACHE_N_COLORS) -+------------------+ -| Atomic KMAP area | FIXADDR_START KM_TYPE_NR * -| | NR_CPUS * -| | DCACHE_N_COLORS * -| | PAGE_SIZE -+------------------+ FIXADDR_TOP 0x9ffff000 -+------------------+ -| Cached KSEG | XCHAL_KSEG_CACHED_VADDR 0xa0000000 512MB -+------------------+ -| Uncached KSEG | XCHAL_KSEG_BYPASS_VADDR 0xc0000000 512MB -+------------------+ -| Cached KIO | XCHAL_KIO_CACHED_VADDR 0xe0000000 256MB -+------------------+ -| Uncached KIO | XCHAL_KIO_BYPASS_VADDR 0xf0000000 256MB -+------------------+ diff --git a/arch/xtensa/include/asm/initialize_mmu.h b/arch/xtensa/include/asm/initialize_mmu.h index 323d05789159..3b054d2bede0 100644 --- a/arch/xtensa/include/asm/initialize_mmu.h +++ b/arch/xtensa/include/asm/initialize_mmu.h @@ -42,7 +42,7 @@ #if XCHAL_HAVE_S32C1I && (XCHAL_HW_MIN_VERSION >= XTENSA_HWVERSION_RC_2009_0) /* * We Have Atomic Operation Control (ATOMCTL) Register; Initialize it. - * For details see Documentation/xtensa/atomctl.txt + * For details see Documentation/xtensa/atomctl.rst */ #if XCHAL_DCACHE_IS_COHERENT movi a3, 0x25 /* For SMP/MX -- internal for writeback, -- cgit v1.2.3-70-g09d2 From 330d48105245abfb8c9ca491dc53ea500657217a Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 13 Jun 2019 15:21:39 -0300 Subject: docs: admin-guide: add kdump documentation into it The Kdump documentation describes procedures with admins use in order to solve issues on their systems. Signed-off-by: Mauro Carvalho Chehab --- Documentation/admin-guide/bug-hunting.rst | 4 +- Documentation/admin-guide/index.rst | 1 + Documentation/admin-guide/kdump/gdbmacros.txt | 264 +++++++++++ Documentation/admin-guide/kdump/index.rst | 20 + Documentation/admin-guide/kdump/kdump.rst | 534 ++++++++++++++++++++++ Documentation/admin-guide/kdump/vmcoreinfo.rst | 488 ++++++++++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 6 +- Documentation/kdump/gdbmacros.txt | 264 ----------- Documentation/kdump/index.rst | 21 - Documentation/kdump/kdump.rst | 534 ---------------------- Documentation/kdump/vmcoreinfo.rst | 488 -------------------- Documentation/powerpc/firmware-assisted-dump.txt | 2 +- Documentation/translations/zh_CN/oops-tracing.txt | 4 +- MAINTAINERS | 2 +- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 4 +- 18 files changed, 1321 insertions(+), 1321 deletions(-) create mode 100644 Documentation/admin-guide/kdump/gdbmacros.txt create mode 100644 Documentation/admin-guide/kdump/index.rst create mode 100644 Documentation/admin-guide/kdump/kdump.rst create mode 100644 Documentation/admin-guide/kdump/vmcoreinfo.rst delete mode 100644 Documentation/kdump/gdbmacros.txt delete mode 100644 Documentation/kdump/index.rst delete mode 100644 Documentation/kdump/kdump.rst delete mode 100644 Documentation/kdump/vmcoreinfo.rst (limited to 'arch') diff --git a/Documentation/admin-guide/bug-hunting.rst b/Documentation/admin-guide/bug-hunting.rst index b761aa2a51d2..44b8a4edd348 100644 --- a/Documentation/admin-guide/bug-hunting.rst +++ b/Documentation/admin-guide/bug-hunting.rst @@ -90,9 +90,9 @@ the disk is not available then you have three options: run a null modem to a second machine and capture the output there using your favourite communication program. Minicom works well. -(3) Use Kdump (see Documentation/kdump/kdump.rst), +(3) Use Kdump (see Documentation/admin-guide/kdump/kdump.rst), extract the kernel ring buffer from old memory with using dmesg - gdbmacro in Documentation/kdump/gdbmacros.txt. + gdbmacro in Documentation/admin-guide/kdump/gdbmacros.txt. Finding the bug's location -------------------------- diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 6fcc83aaa9b6..5b63182ceb5f 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -39,6 +39,7 @@ problems and bugs in particular. ramoops dynamic-debug-howto init + kdump/index perf/index This is the beginning of a section with information of interest to diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt new file mode 100644 index 000000000000..220d0a80ca2c --- /dev/null +++ b/Documentation/admin-guide/kdump/gdbmacros.txt @@ -0,0 +1,264 @@ +# +# This file contains a few gdb macros (user defined commands) to extract +# useful information from kernel crashdump (kdump) like stack traces of +# all the processes or a particular process and trapinfo. +# +# These macros can be used by copying this file in .gdbinit (put in home +# directory or current directory) or by invoking gdb command with +# --command= option +# +# Credits: +# Alexander Nyberg +# V Srivatsa +# Maneesh Soni +# + +define bttnobp + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $stacksize = sizeof(union thread_union) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm + printf "===================\n" + set var $stackp = $next_t.thread.sp + set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize + + while ($stackp < $stack_top) + if (*($stackp) > _stext && *($stackp) < _sinittext) + info symbol *($stackp) + end + set $stackp += 4 + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document bttnobp + dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER +end + +define btthreadstack + set var $pid_task = $arg0 + + printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm + printf "task struct: " + print $pid_task + printf "===================\n" + set var $stackp = $pid_task.thread.sp + set var $stacksize = sizeof(union thread_union) + set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize + set var $stack_bot = ($stackp & ~($stacksize - 1)) + + set $stackp = *((unsigned long *) $stackp) + while (($stackp < $stack_top) && ($stackp > $stack_bot)) + set var $addr = *(((unsigned long *) $stackp) + 1) + info symbol $addr + set $stackp = *((unsigned long *) $stackp) + end +end +document btthreadstack + dump a thread stack using the given task structure pointer +end + + +define btt + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + btthreadstack $next_t + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + btthreadstack $next_th + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end +end +document btt + dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER +end + +define btpid + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + btthreadstack $pid_task +end +document btpid + backtrace of pid +end + + +define trapinfo + set var $pid = $arg0 + set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) + set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) + set $init_t=&init_task + set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) + set var $pid_task = 0 + + while ($next_t != $init_t) + set $next_t=(struct task_struct *)$next_t + + if ($next_t.pid == $pid) + set $pid_task = $next_t + end + + set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) + while ($next_th != $next_t) + set $next_th=(struct task_struct *)$next_th + if ($next_th.pid == $pid) + set $pid_task = $next_th + end + set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) + end + set $next_t=(char *)($next_t->tasks.next) - $tasks_off + end + + printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ + $pid_task.thread.cr2, $pid_task.thread.error_code + +end +document trapinfo + Run info threads and lookup pid of thread #1 + 'trapinfo ' will tell you by which trap & possibly + address the kernel panicked. +end + +define dump_log_idx + set $idx = $arg0 + if ($argc > 1) + set $prev_flags = $arg1 + else + set $prev_flags = 0 + end + set $msg = ((struct printk_log *) (log_buf + $idx)) + set $prefix = 1 + set $newline = 1 + set $log = log_buf + $idx + sizeof(*$msg) + + # prev & LOG_CONT && !(msg->flags & LOG_PREIX) + if (($prev_flags & 8) && !($msg->flags & 4)) + set $prefix = 0 + end + + # msg->flags & LOG_CONT + if ($msg->flags & 8) + # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) + if (($prev_flags & 8) && !($prev_flags & 2)) + set $prefix = 0 + end + # (!(msg->flags & LOG_NEWLINE)) + if (!($msg->flags & 2)) + set $newline = 0 + end + end + + if ($prefix) + printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 + end + if ($msg->text_len != 0) + eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len + end + if ($newline) + printf "\n" + end + if ($msg->dict_len > 0) + set $dict = $log + $msg->text_len + set $idx = 0 + set $line = 1 + while ($idx < $msg->dict_len) + if ($line) + printf " " + set $line = 0 + end + set $c = $dict[$idx] + if ($c == '\0') + printf "\n" + set $line = 1 + else + if ($c < ' ' || $c >= 127 || $c == '\\') + printf "\\x%02x", $c + else + printf "%c", $c + end + end + set $idx = $idx + 1 + end + printf "\n" + end +end +document dump_log_idx + Dump a single log given its index in the log buffer. The first + parameter is the index into log_buf, the second is optional and + specified the previous log buffer's flags, used for properly + formatting continued lines. +end + +define dmesg + set $i = log_first_idx + set $end_idx = log_first_idx + set $prev_flags = 0 + + while (1) + set $msg = ((struct printk_log *) (log_buf + $i)) + if ($msg->len == 0) + set $i = 0 + else + dump_log_idx $i $prev_flags + set $i = $i + $msg->len + set $prev_flags = $msg->flags + end + if ($i == $end_idx) + loop_break + end + end +end +document dmesg + print the kernel ring buffer +end diff --git a/Documentation/admin-guide/kdump/index.rst b/Documentation/admin-guide/kdump/index.rst new file mode 100644 index 000000000000..8e2ebd0383cd --- /dev/null +++ b/Documentation/admin-guide/kdump/index.rst @@ -0,0 +1,20 @@ + +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +.. toctree:: + :maxdepth: 1 + + kdump + vmcoreinfo + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst new file mode 100644 index 000000000000..ac7e131d2935 --- /dev/null +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -0,0 +1,534 @@ +================================================================ +Documentation for Kdump - The kexec-based Crash Dumping Solution +================================================================ + +This document includes overview, setup and installation, and analysis +information. + +Overview +======== + +Kdump uses kexec to quickly boot to a dump-capture kernel whenever a +dump of the system kernel's memory needs to be taken (for example, when +the system panics). The system kernel's memory image is preserved across +the reboot and is accessible to the dump-capture kernel. + +You can use common commands, such as cp and scp, to copy the +memory image to a dump file on the local disk, or across the network to +a remote system. + +Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, +s390x, arm and arm64 architectures. + +When the system kernel boots, it reserves a small section of memory for +the dump-capture kernel. This ensures that ongoing Direct Memory Access +(DMA) from the system kernel does not corrupt the dump-capture kernel. +The kexec -p command loads the dump-capture kernel into this reserved +memory. + +On x86 machines, the first 640 KB of physical memory is needed to boot, +regardless of where the kernel loads. Therefore, kexec backs up this +region just before rebooting into the dump-capture kernel. + +Similarly on PPC64 machines first 32KB of physical memory is needed for +booting regardless of where the kernel is loaded and to support 64K page +size kexec backs up the first 64KB memory. + +For s390x, when kdump is triggered, the crashkernel region is exchanged +with the region [0, crashkernel region size] and then the kdump kernel +runs in [0, crashkernel region size]. Therefore no relocatable kernel is +needed for s390x. + +All of the necessary information about the system kernel's core image is +encoded in the ELF format, and stored in a reserved area of memory +before a crash. The physical address of the start of the ELF header is +passed to the dump-capture kernel through the elfcorehdr= boot +parameter. Optionally the size of the ELF header can also be passed +when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. + + +With the dump-capture kernel, you can access the memory image through +/proc/vmcore. This exports the dump as an ELF-format file that you can +write out using file copy commands such as cp or scp. Further, you can +use analysis tools such as the GNU Debugger (GDB) and the Crash tool to +debug the dump file. This method ensures that the dump pages are correctly +ordered. + + +Setup and Installation +====================== + +Install kexec-tools +------------------- + +1) Login as the root user. + +2) Download the kexec-tools user-space package from the following URL: + +http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz + +This is a symlink to the latest version. + +The latest kexec-tools git tree is available at: + +- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git +- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git + +There is also a gitweb interface available at +http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git + +More information about kexec-tools can be found at +http://horms.net/projects/kexec/ + +3) Unpack the tarball with the tar command, as follows:: + + tar xvpzf kexec-tools.tar.gz + +4) Change to the kexec-tools directory, as follows:: + + cd kexec-tools-VERSION + +5) Configure the package, as follows:: + + ./configure + +6) Compile the package, as follows:: + + make + +7) Install the package, as follows:: + + make install + + +Build the system and dump-capture kernels +----------------------------------------- +There are two possible methods of using Kdump. + +1) Build a separate custom dump-capture kernel for capturing the + kernel core dump. + +2) Or use the system kernel binary itself as dump-capture kernel and there is + no need to build a separate dump-capture kernel. This is possible + only with the architectures which support a relocatable kernel. As + of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support + relocatable kernel. + +Building a relocatable kernel is advantageous from the point of view that +one does not have to build a second kernel for capturing the dump. But +at the same time one might want to build a custom dump capture kernel +suitable to his needs. + +Following are the configuration setting required for system and +dump-capture kernels for enabling kdump support. + +System kernel config options +---------------------------- + +1) Enable "kexec system call" in "Processor type and features.":: + + CONFIG_KEXEC=y + +2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo + filesystems." This is usually enabled by default:: + + CONFIG_SYSFS=y + + Note that "sysfs file system support" might not appear in the "Pseudo + filesystems" menu if "Configure standard kernel features (for small + systems)" is not enabled in "General Setup." In this case, check the + .config file itself to ensure that sysfs is turned on, as follows:: + + grep 'CONFIG_SYSFS' .config + +3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: + + CONFIG_DEBUG_INFO=Y + + This causes the kernel to be built with debug symbols. The dump + analysis tools require a vmlinux with debug symbols in order to read + and analyze a dump file. + +Dump-capture kernel config options (Arch Independent) +----------------------------------------------------- + +1) Enable "kernel crash dumps" support under "Processor type and + features":: + + CONFIG_CRASH_DUMP=y + +2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: + + CONFIG_PROC_VMCORE=y + + (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) + +Dump-capture kernel config options (Arch Dependent, i386 and x86_64) +-------------------------------------------------------------------- + +1) On i386, enable high memory support under "Processor type and + features":: + + CONFIG_HIGHMEM64G=y + + or:: + + CONFIG_HIGHMEM4G + +2) On i386 and x86_64, disable symmetric multi-processing support + under "Processor type and features":: + + CONFIG_SMP=n + + (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line + when loading the dump-capture kernel, see section "Load the Dump-capture + Kernel".) + +3) If one wants to build and use a relocatable kernel, + Enable "Build a relocatable kernel" support under "Processor type and + features":: + + CONFIG_RELOCATABLE=y + +4) Use a suitable value for "Physical address where the kernel is + loaded" (under "Processor type and features"). This only appears when + "kernel crash dumps" is enabled. A suitable value depends upon + whether kernel is relocatable or not. + + If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 + This will compile the kernel for physical address 1MB, but given the fact + kernel is relocatable, it can be run from any physical address hence + kexec boot loader will load it in memory region reserved for dump-capture + kernel. + + Otherwise it should be the start of memory region reserved for + second kernel using boot parameter "crashkernel=Y@X". Here X is + start of memory region reserved for dump-capture kernel. + Generally X is 16MB (0x1000000). So you can set + CONFIG_PHYSICAL_START=0x1000000 + +5) Make and install the kernel and its modules. DO NOT add this kernel + to the boot loader configuration files. + +Dump-capture kernel config options (Arch Dependent, ppc64) +---------------------------------------------------------- + +1) Enable "Build a kdump crash kernel" support under "Kernel" options:: + + CONFIG_CRASH_DUMP=y + +2) Enable "Build a relocatable kernel" support:: + + CONFIG_RELOCATABLE=y + + Make and install the kernel and its modules. + +Dump-capture kernel config options (Arch Dependent, ia64) +---------------------------------------------------------- + +- No specific options are required to create a dump-capture kernel + for ia64, other than those specified in the arch independent section + above. This means that it is possible to use the system kernel + as a dump-capture kernel if desired. + + The crashkernel region can be automatically placed by the system + kernel at run time. This is done by specifying the base address as 0, + or omitting it all together:: + + crashkernel=256M@0 + + or:: + + crashkernel=256M + + If the start address is specified, note that the start address of the + kernel will be aligned to 64Mb, so if the start address is not then + any space below the alignment point will be wasted. + +Dump-capture kernel config options (Arch Dependent, arm) +---------------------------------------------------------- + +- To use a relocatable kernel, + Enable "AUTO_ZRELADDR" support under "Boot" options:: + + AUTO_ZRELADDR=y + +Dump-capture kernel config options (Arch Dependent, arm64) +---------------------------------------------------------- + +- Please note that kvm of the dump-capture kernel will not be enabled + on non-VHE systems even if it is configured. This is because the CPU + will not be reset to EL2 on panic. + +Extended crashkernel syntax +=========================== + +While the "crashkernel=size[@offset]" syntax is sufficient for most +configurations, sometimes it's handy to have the reserved memory dependent +on the value of System RAM -- that's mostly for distributors that pre-setup +the kernel command line to avoid a unbootable system after some memory has +been removed from the machine. + +The syntax is:: + + crashkernel=:[,:,...][@offset] + range=start-[end] + +For example:: + + crashkernel=512M-2G:64M,2G-:128M + +This would mean: + + 1) if the RAM is smaller than 512M, then don't reserve anything + (this is the "rescue" case) + 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M + 3) if the RAM size is larger than 2G, then reserve 128M + + + +Boot into System Kernel +======================= + +1) Update the boot loader (such as grub, yaboot, or lilo) configuration + files as necessary. + +2) Boot the system kernel with the boot parameter "crashkernel=Y@X", + where Y specifies how much memory to reserve for the dump-capture kernel + and X specifies the beginning of this reserved memory. For example, + "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory + starting at physical address 0x01000000 (16MB) for the dump-capture kernel. + + On x86 and x86_64, use "crashkernel=64M@16M". + + On ppc64, use "crashkernel=128M@32M". + + On ia64, 256M@256M is a generous value that typically works. + The region may be automatically placed on ia64, see the + dump-capture kernel config option notes above. + If use sparse memory, the size should be rounded to GRANULE boundaries. + + On s390x, typically use "crashkernel=xxM". The value of xx is dependent + on the memory consumption of the kdump system. In general this is not + dependent on the memory size of the production system. + + On arm, the use of "crashkernel=Y@X" is no longer necessary; the + kernel will automatically locate the crash kernel image within the + first 512MB of RAM if X is not given. + + On arm64, use "crashkernel=Y[@X]". Note that the start address of + the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). + +Load the Dump-capture Kernel +============================ + +After booting to the system kernel, dump-capture kernel needs to be +loaded. + +Based on the architecture and type of image (relocatable or not), one +can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz +of dump-capture kernel. Following is the summary. + +For i386 and x86_64: + + - Use vmlinux if kernel is not relocatable. + - Use bzImage/vmlinuz if kernel is relocatable. + +For ppc64: + + - Use vmlinux + +For ia64: + + - Use vmlinux or vmlinuz.gz + +For s390x: + + - Use image or bzImage + +For arm: + + - Use zImage + +For arm64: + + - Use vmlinux or Image + +If you are using an uncompressed vmlinux image then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= --args-linux \ + --append="root= " + +If you are using a compressed bzImage/vmlinuz, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +If you are using a compressed zImage, then use following command +to load dump-capture kernel:: + + kexec --type zImage -p \ + --initrd= \ + --dtb= \ + --append="root= " + +If you are using an uncompressed Image, then use following command +to load dump-capture kernel:: + + kexec -p \ + --initrd= \ + --append="root= " + +Please note, that --args-linux does not need to be specified for ia64. +It is planned to make this a no-op on that architecture, but for now +it should be omitted + +Following are the arch specific command line options to be used while +loading dump-capture kernel. + +For i386, x86_64 and ia64: + + "1 irqpoll maxcpus=1 reset_devices" + +For ppc64: + + "1 maxcpus=1 noirqdistrib reset_devices" + +For s390x: + + "1 maxcpus=1 cgroup_disable=memory" + +For arm: + + "1 maxcpus=1 reset_devices" + +For arm64: + + "1 maxcpus=1 reset_devices" + +Notes on loading the dump-capture kernel: + +* By default, the ELF headers are stored in ELF64 format to support + systems with more than 4GB memory. On i386, kexec automatically checks if + the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. + So, on non-PAE systems, ELF32 is always used. + + The --elf32-core-headers option can be used to force the generation of ELF32 + headers. This is necessary because GDB currently cannot open vmcore files + with ELF64 headers on 32-bit systems. + +* The "irqpoll" boot parameter reduces driver initialization failures + due to shared interrupts in the dump-capture kernel. + +* You must specify in the format corresponding to the root + device name in the output of mount command. + +* Boot parameter "1" boots the dump-capture kernel into single-user + mode without networking. If you want networking, use "3". + +* We generally don't have to bring up a SMP kernel just to capture the + dump. Hence generally it is useful either to build a UP dump-capture + kernel or specify maxcpus=1 option while loading dump-capture kernel. + Note, though maxcpus always works, you had better replace it with + nr_cpus to save memory if supported by the current ARCH, such as x86. + +* You should enable multi-cpu support in dump-capture kernel if you intend + to use multi-thread programs with it, such as parallel dump feature of + makedumpfile. Otherwise, the multi-thread program may have a great + performance degradation. To enable multi-cpu support, you should bring up an + SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] + options while loading it. + +* For s390x there are two kdump modes: If a ELF header is specified with + the elfcorehdr= kernel parameter, it is used by the kdump kernel as it + is done on all other architectures. If no elfcorehdr= kernel parameter is + specified, the s390x kdump kernel dynamically creates the header. The + second mode has the advantage that for CPU and memory hotplug, kdump has + not to be reloaded with kexec_load(). + +* For s390x systems with many attached devices the "cio_ignore" kernel + parameter should be used for the kdump kernel in order to prevent allocation + of kernel memory for devices that are not relevant for kdump. The same + applies to systems that use SCSI/FCP devices. In that case the + "allow_lun_scan" zfcp module parameter should be set to zero before + setting FCP devices online. + +Kernel Panic +============ + +After successfully loading the dump-capture kernel as previously +described, the system will reboot into the dump-capture kernel if a +system crash is triggered. Trigger points are located in panic(), +die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). + +The following conditions will execute a crash trigger point: + +If a hard lockup is detected and "NMI watchdog" is configured, the system +will boot into the dump-capture kernel ( die_nmi() ). + +If die() is called, and it happens to be a thread with pid 0 or 1, or die() +is called inside interrupt context or die() is called and panic_on_oops is set, +the system will boot into the dump-capture kernel. + +On powerpc systems when a soft-reset is generated, die() is called by all cpus +and the system will boot into the dump-capture kernel. + +For testing purposes, you can trigger a crash by using "ALT-SysRq-c", +"echo c > /proc/sysrq-trigger" or write a module to force the panic. + +Write Out the Dump File +======================= + +After the dump-capture kernel is booted, write out the dump file with +the following command:: + + cp /proc/vmcore + + +Analysis +======== + +Before analyzing the dump image, you should reboot into a stable kernel. + +You can do limited analysis using GDB on the dump file copied out of +/proc/vmcore. Use the debug vmlinux built with -g and run the following +command:: + + gdb vmlinux + +Stack trace for the task on processor 0, register display, and memory +display work fine. + +Note: GDB cannot analyze core files generated in ELF64 format for x86. +On systems with a maximum of 4GB of memory, you can generate +ELF32-format headers using the --elf32-core-headers kernel option on the +dump kernel. + +You can also use the Crash utility to analyze dump files in Kdump +format. Crash is available on Dave Anderson's site at the following URL: + + http://people.redhat.com/~anderson/ + +Trigger Kdump on WARN() +======================= + +The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This +will cause a kdump to occur at the panic() call. In cases where a user wants +to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 +to achieve the same behaviour. + +Contact +======= + +- Vivek Goyal (vgoyal@redhat.com) +- Maneesh Soni (maneesh@in.ibm.com) + +GDB macros +========== + +.. include:: gdbmacros.txt + :literal: diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst new file mode 100644 index 000000000000..007a6b86e0ee --- /dev/null +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -0,0 +1,488 @@ +========== +VMCOREINFO +========== + +What is it? +=========== + +VMCOREINFO is a special ELF note section. It contains various +information from the kernel like structure size, page size, symbol +values, field offsets, etc. These data are packed into an ELF note +section and used by user-space tools like crash and makedumpfile to +analyze a kernel's memory layout. + +Common variables +================ + +init_uts_ns.name.release +------------------------ + +The version of the Linux kernel. Used to find the corresponding source +code from which the kernel has been built. For example, crash uses it to +find the corresponding vmlinux in order to process vmcore. + +PAGE_SIZE +--------- + +The size of a page. It is the smallest unit of data used by the memory +management facilities. It is usually 4096 bytes of size and a page is +aligned on 4096 bytes. Used for computing page addresses. + +init_uts_ns +----------- + +The UTS namespace which is used to isolate two specific elements of the +system that relate to the uname(2) system call. It is named after the +data structure used to store information returned by the uname(2) system +call. + +User-space tools can get the kernel name, host name, kernel release +number, kernel version, architecture name and OS type from it. + +node_online_map +--------------- + +An array node_states[N_ONLINE] which represents the set of online nodes +in a system, one bit position per node number. Used to keep track of +which nodes are in the system and online. + +swapper_pg_dir +-------------- + +The global page directory pointer of the kernel. Used to translate +virtual to physical addresses. + +_stext +------ + +Defines the beginning of the text section. In general, _stext indicates +the kernel start address. Used to convert a virtual address from the +direct kernel map to a physical address. + +vmap_area_list +-------------- + +Stores the virtual area list. makedumpfile gets the vmalloc start value +from this variable and its value is necessary for vmalloc translation. + +mem_map +------- + +Physical addresses are translated to struct pages by treating them as +an index into the mem_map array. Right-shifting a physical address +PAGE_SHIFT bits converts it into a page frame number which is an index +into that mem_map array. + +Used to map an address to the corresponding struct page. + +contig_page_data +---------------- + +Makedumpfile gets the pglist_data structure from this symbol, which is +used to describe the memory layout. + +User-space tools use this to exclude free pages when dumping memory. + +mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) +-------------------------------------------------------------------------- + +The address of the mem_section array, its length, structure size, and +the section_mem_map offset. + +It exists in the sparse memory mapping model, and it is also somewhat +similar to the mem_map variable, both of them are used to translate an +address. + +page +---- + +The size of a page structure. struct page is an important data structure +and it is widely used to compute contiguous memory. + +pglist_data +----------- + +The size of a pglist_data structure. This value is used to check if the +pglist_data structure is valid. It is also used for checking the memory +type. + +zone +---- + +The size of a zone structure. This value is used to check if the zone +structure has been found. It is also used for excluding free pages. + +free_area +--------- + +The size of a free_area structure. It indicates whether the free_area +structure is valid or not. Useful when excluding free pages. + +list_head +--------- + +The size of a list_head structure. Used when iterating lists in a +post-mortem analysis session. + +nodemask_t +---------- + +The size of a nodemask_t type. Used to compute the number of online +nodes. + +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) +------------------------------------------------------------------------------------------------- + +User-space tools compute their values based on the offset of these +variables. The variables are used when excluding unnecessary pages. + +(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) +----------------------------------------------------------------------------------------- + +On NUMA machines, each NUMA node has a pg_data_t to describe its memory +layout. On UMA machines there is a single pglist_data which describes the +whole memory. + +These values are used to check the memory type and to compute the +virtual address for memory map. + +(zone, free_area|vm_stat|spanned_pages) +--------------------------------------- + +Each node is divided into a number of blocks called zones which +represent ranges within memory. A zone is described by a structure zone. + +User-space tools compute required values based on the offset of these +variables. + +(free_area, free_list) +---------------------- + +Offset of the free_list's member. This value is used to compute the number +of free pages. + +Each zone has a free_area structure array called free_area[MAX_ORDER]. +The free_list represents a linked list of free page blocks. + +(list_head, next|prev) +---------------------- + +Offsets of the list_head's members. list_head is used to define a +circular linked list. User-space tools need these in order to traverse +lists. + +(vmap_area, va_start|list) +-------------------------- + +Offsets of the vmap_area's members. They carry vmalloc-specific +information. Makedumpfile gets the start address of the vmalloc region +from this. + +(zone.free_area, MAX_ORDER) +--------------------------- + +Free areas descriptor. User-space tools use this value to iterate the +free_area ranges. MAX_ORDER is used by the zone buddy allocator. + +log_first_idx +------------- + +Index of the first record stored in the buffer log_buf. Used by +user-space tools to read the strings in the log_buf. + +log_buf +------- + +Console output is written to the ring buffer log_buf at index +log_first_idx. Used to get the kernel log. + +log_buf_len +----------- + +log_buf's length. + +clear_idx +--------- + +The index that the next printk() record to read after the last clear +command. It indicates the first record after the last SYSLOG_ACTION +_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump +the dmesg log. + +log_next_idx +------------ + +The index of the next record to store in the buffer log_buf. Used to +compute the index of the current buffer position. + +printk_log +---------- + +The size of a structure printk_log. Used to compute the size of +messages, and extract dmesg log. It encapsulates header information for +log_buf, such as timestamp, syslog level, etc. + +(printk_log, ts_nsec|len|text_len|dict_len) +------------------------------------------- + +It represents field offsets in struct printk_log. User space tools +parse it and check whether the values of printk_log's members have been +changed. + +(free_area.free_list, MIGRATE_TYPES) +------------------------------------ + +The number of migrate types for pages. The free_list is described by the +array. Used by tools to compute the number of free pages. + +NR_FREE_PAGES +------------- + +On linux-2.6.21 or later, the number of free pages is in +vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. + +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask +------------------------------------------------------------------------------ + +Page attributes. These flags are used to filter various unnecessary for +dumping pages. + +PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) +----------------------------------------------------------------------------- + +More page attributes. These flags are used to filter various unnecessary for +dumping pages. + + +HUGETLB_PAGE_DTOR +----------------- + +The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile +excludes these pages. + +x86_64 +====== + +phys_base +--------- + +Used to convert the virtual address of an exported kernel symbol to its +corresponding physical address. + +init_top_pgt +------------ + +Used to walk through the whole page table and convert virtual addresses +to physical addresses. The init_top_pgt is somewhat similar to +swapper_pg_dir, but it is only used in x86_64. + +pgtable_l5_enabled +------------------ + +User-space tools need to know whether the crash kernel was in 5-level +paging mode. + +node_data +--------- + +This is a struct pglist_data array and stores all NUMA nodes +information. Makedumpfile gets the pglist_data structure from it. + +(node_data, MAX_NUMNODES) +------------------------- + +The maximum number of nodes in system. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +KERNEL_IMAGE_SIZE +----------------- + +Currently unused by Makedumpfile. Used to compute the module virtual +address by Crash. + +sme_mask +-------- + +AMD-specific with SME support: it indicates the secure memory encryption +mask. Makedumpfile tools need to know whether the crash kernel was +encrypted. If SME is enabled in the first kernel, the crash kernel's +page table entries (pgd/pud/pmd/pte) contain the memory encryption +mask. This is used to remove the SME mask and obtain the true physical +address. + +Currently, sme_mask stores the value of the C-bit position. If needed, +additional SME-relevant info can be placed in that variable. + +For example:: + + [ misc ][ enc bit ][ other misc SME info ] + 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 + 63 59 55 51 47 43 39 35 31 27 ... 3 + +x86_32 +====== + +X86_PAE +------- + +Denotes whether physical address extensions are enabled. It has the cost +of a higher page table lookup overhead, and also consumes more page +table space per process. Used to check whether PAE was enabled in the +crash kernel when converting virtual addresses to physical addresses. + +ia64 +==== + +pgdat_list|(pgdat_list, MAX_NUMNODES) +------------------------------------- + +pg_data_t array storing all NUMA nodes information. MAX_NUMNODES +indicates the number of the nodes. + +node_memblk|(node_memblk, NR_NODE_MEMBLKS) +------------------------------------------ + +List of node memory chunks. Filled when parsing the SRAT table to obtain +information about memory nodes. NR_NODE_MEMBLKS indicates the number of +node memory chunks. + +These values are used to compute the number of nodes the crashed kernel used. + +node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) +---------------------------------------------------------------- + +The size of a struct node_memblk_s and the offsets of the +node_memblk_s's members. Used to compute the number of nodes. + +PGTABLE_3|PGTABLE_4 +------------------- + +User-space tools need to know whether the crash kernel was in 3-level or +4-level paging mode. Used to distinguish the page table. + +ARM64 +===== + +VA_BITS +------- + +The maximum number of bits for virtual addresses. Used to compute the +virtual memory ranges. + +kimage_voffset +-------------- + +The offset between the kernel virtual and physical mappings. Used to +translate virtual to physical addresses. + +PHYS_OFFSET +----------- + +Indicates the physical address of the start of memory. Similar to +kimage_voffset, which is used to translate virtual to physical +addresses. + +KERNELOFFSET +------------ + +The kernel randomization offset. Used to compute the page offset. If +KASLR is disabled, this value is zero. + +arm +=== + +ARM_LPAE +-------- + +It indicates whether the crash kernel supports large physical address +extensions. Used to translate virtual to physical addresses. + +s390 +==== + +lowcore_ptr +----------- + +An array with a pointer to the lowcore of every CPU. Used to print the +psw and all registers information. + +high_memory +----------- + +Used to get the vmalloc_start address from the high_memory symbol. + +(lowcore_ptr, NR_CPUS) +---------------------- + +The maximum number of CPUs. + +powerpc +======= + + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +contig_page_data +---------------- + +See above. + +vmemmap_list +------------ + +The vmemmap_list maintains the entire vmemmap physical mapping. Used +to get vmemmap list count and populated vmemmap regions info. If the +vmemmap address translation information is stored in the crash kernel, +it is used to translate vmemmap kernel virtual addresses. + +mmu_vmemmap_psize +----------------- + +The size of a page. Used to translate virtual to physical addresses. + +mmu_psize_defs +-------------- + +Page size definitions, i.e. 4k, 64k, or 16M. + +Used to make vtop translations. + +vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) +-------------------------------------------------------------------------------------------- + +The vmemmap virtual address space management does not have a traditional +page table to track which virtual struct pages are backed by a physical +mapping. The virtual to physical mappings are tracked in a simple linked +list format. + +User-space tools need to know the offset of list, phys and virt_addr +when computing the count of vmemmap regions. + +mmu_psize_def|(mmu_psize_def, shift) +------------------------------------ + +The size of a struct mmu_psize_def and the offset of mmu_psize_def's +member. + +Used in vtop translations. + +sh +== + +node_data|(node_data, MAX_NUMNODES) +----------------------------------- + +See above. + +X2TLB +----- + +Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4821175a3769..e645b3ab4b6f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -708,14 +708,14 @@ [KNL, x86_64] select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. - See Documentation/kdump/kdump.rst for further details. + See Documentation/admin-guide/kdump/kdump.rst for further details. crashkernel=range1:size1[,range2:size2,...][@offset] [KNL] Same as above, but depends on the memory in the running system. The syntax of range is start-[end] where start and end are both a memory unit (amount[KMG]). See also - Documentation/kdump/kdump.rst for an example. + Documentation/admin-guide/kdump/kdump.rst for an example. crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel @@ -1207,7 +1207,7 @@ Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. - See Documentation/kdump/kdump.rst for details. + See Documentation/admin-guide/kdump/kdump.rst for details. enable_mtrr_cleanup [X86] The kernel tries to adjust MTRR layout from continuous diff --git a/Documentation/kdump/gdbmacros.txt b/Documentation/kdump/gdbmacros.txt deleted file mode 100644 index 220d0a80ca2c..000000000000 --- a/Documentation/kdump/gdbmacros.txt +++ /dev/null @@ -1,264 +0,0 @@ -# -# This file contains a few gdb macros (user defined commands) to extract -# useful information from kernel crashdump (kdump) like stack traces of -# all the processes or a particular process and trapinfo. -# -# These macros can be used by copying this file in .gdbinit (put in home -# directory or current directory) or by invoking gdb command with -# --command= option -# -# Credits: -# Alexander Nyberg -# V Srivatsa -# Maneesh Soni -# - -define bttnobp - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $stacksize = sizeof(union thread_union) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - printf "\npid %d; comm %s:\n", $next_t.pid, $next_t.comm - printf "===================\n" - set var $stackp = $next_t.thread.sp - set var $stack_top = ($stackp & ~($stacksize - 1)) + stacksize - - while ($stackp < $stack_top) - if (*($stackp) > _stext && *($stackp) < _sinittext) - info symbol *($stackp) - end - set $stackp += 4 - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document bttnobp - dump all thread stack traces on a kernel compiled with !CONFIG_FRAME_POINTER -end - -define btthreadstack - set var $pid_task = $arg0 - - printf "\npid %d; comm %s:\n", $pid_task.pid, $pid_task.comm - printf "task struct: " - print $pid_task - printf "===================\n" - set var $stackp = $pid_task.thread.sp - set var $stacksize = sizeof(union thread_union) - set var $stack_top = ($stackp & ~($stacksize - 1)) + $stacksize - set var $stack_bot = ($stackp & ~($stacksize - 1)) - - set $stackp = *((unsigned long *) $stackp) - while (($stackp < $stack_top) && ($stackp > $stack_bot)) - set var $addr = *(((unsigned long *) $stackp) + 1) - info symbol $addr - set $stackp = *((unsigned long *) $stackp) - end -end -document btthreadstack - dump a thread stack using the given task structure pointer -end - - -define btt - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - btthreadstack $next_t - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - btthreadstack $next_th - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end -end -document btt - dump all thread stack traces on a kernel compiled with CONFIG_FRAME_POINTER -end - -define btpid - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - btthreadstack $pid_task -end -document btpid - backtrace of pid -end - - -define trapinfo - set var $pid = $arg0 - set $tasks_off=((size_t)&((struct task_struct *)0)->tasks) - set $pid_off=((size_t)&((struct task_struct *)0)->thread_group.next) - set $init_t=&init_task - set $next_t=(((char *)($init_t->tasks).next) - $tasks_off) - set var $pid_task = 0 - - while ($next_t != $init_t) - set $next_t=(struct task_struct *)$next_t - - if ($next_t.pid == $pid) - set $pid_task = $next_t - end - - set $next_th=(((char *)$next_t->thread_group.next) - $pid_off) - while ($next_th != $next_t) - set $next_th=(struct task_struct *)$next_th - if ($next_th.pid == $pid) - set $pid_task = $next_th - end - set $next_th=(((char *)$next_th->thread_group.next) - $pid_off) - end - set $next_t=(char *)($next_t->tasks.next) - $tasks_off - end - - printf "Trapno %ld, cr2 0x%lx, error_code %ld\n", $pid_task.thread.trap_no, \ - $pid_task.thread.cr2, $pid_task.thread.error_code - -end -document trapinfo - Run info threads and lookup pid of thread #1 - 'trapinfo ' will tell you by which trap & possibly - address the kernel panicked. -end - -define dump_log_idx - set $idx = $arg0 - if ($argc > 1) - set $prev_flags = $arg1 - else - set $prev_flags = 0 - end - set $msg = ((struct printk_log *) (log_buf + $idx)) - set $prefix = 1 - set $newline = 1 - set $log = log_buf + $idx + sizeof(*$msg) - - # prev & LOG_CONT && !(msg->flags & LOG_PREIX) - if (($prev_flags & 8) && !($msg->flags & 4)) - set $prefix = 0 - end - - # msg->flags & LOG_CONT - if ($msg->flags & 8) - # (prev & LOG_CONT && !(prev & LOG_NEWLINE)) - if (($prev_flags & 8) && !($prev_flags & 2)) - set $prefix = 0 - end - # (!(msg->flags & LOG_NEWLINE)) - if (!($msg->flags & 2)) - set $newline = 0 - end - end - - if ($prefix) - printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000 - end - if ($msg->text_len != 0) - eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len - end - if ($newline) - printf "\n" - end - if ($msg->dict_len > 0) - set $dict = $log + $msg->text_len - set $idx = 0 - set $line = 1 - while ($idx < $msg->dict_len) - if ($line) - printf " " - set $line = 0 - end - set $c = $dict[$idx] - if ($c == '\0') - printf "\n" - set $line = 1 - else - if ($c < ' ' || $c >= 127 || $c == '\\') - printf "\\x%02x", $c - else - printf "%c", $c - end - end - set $idx = $idx + 1 - end - printf "\n" - end -end -document dump_log_idx - Dump a single log given its index in the log buffer. The first - parameter is the index into log_buf, the second is optional and - specified the previous log buffer's flags, used for properly - formatting continued lines. -end - -define dmesg - set $i = log_first_idx - set $end_idx = log_first_idx - set $prev_flags = 0 - - while (1) - set $msg = ((struct printk_log *) (log_buf + $i)) - if ($msg->len == 0) - set $i = 0 - else - dump_log_idx $i $prev_flags - set $i = $i + $msg->len - set $prev_flags = $msg->flags - end - if ($i == $end_idx) - loop_break - end - end -end -document dmesg - print the kernel ring buffer -end diff --git a/Documentation/kdump/index.rst b/Documentation/kdump/index.rst deleted file mode 100644 index 2b17fcf6867a..000000000000 --- a/Documentation/kdump/index.rst +++ /dev/null @@ -1,21 +0,0 @@ -:orphan: - -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -.. toctree:: - :maxdepth: 1 - - kdump - vmcoreinfo - -.. only:: subproject and html - - Indices - ======= - - * :ref:`genindex` diff --git a/Documentation/kdump/kdump.rst b/Documentation/kdump/kdump.rst deleted file mode 100644 index ac7e131d2935..000000000000 --- a/Documentation/kdump/kdump.rst +++ /dev/null @@ -1,534 +0,0 @@ -================================================================ -Documentation for Kdump - The kexec-based Crash Dumping Solution -================================================================ - -This document includes overview, setup and installation, and analysis -information. - -Overview -======== - -Kdump uses kexec to quickly boot to a dump-capture kernel whenever a -dump of the system kernel's memory needs to be taken (for example, when -the system panics). The system kernel's memory image is preserved across -the reboot and is accessible to the dump-capture kernel. - -You can use common commands, such as cp and scp, to copy the -memory image to a dump file on the local disk, or across the network to -a remote system. - -Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, -s390x, arm and arm64 architectures. - -When the system kernel boots, it reserves a small section of memory for -the dump-capture kernel. This ensures that ongoing Direct Memory Access -(DMA) from the system kernel does not corrupt the dump-capture kernel. -The kexec -p command loads the dump-capture kernel into this reserved -memory. - -On x86 machines, the first 640 KB of physical memory is needed to boot, -regardless of where the kernel loads. Therefore, kexec backs up this -region just before rebooting into the dump-capture kernel. - -Similarly on PPC64 machines first 32KB of physical memory is needed for -booting regardless of where the kernel is loaded and to support 64K page -size kexec backs up the first 64KB memory. - -For s390x, when kdump is triggered, the crashkernel region is exchanged -with the region [0, crashkernel region size] and then the kdump kernel -runs in [0, crashkernel region size]. Therefore no relocatable kernel is -needed for s390x. - -All of the necessary information about the system kernel's core image is -encoded in the ELF format, and stored in a reserved area of memory -before a crash. The physical address of the start of the ELF header is -passed to the dump-capture kernel through the elfcorehdr= boot -parameter. Optionally the size of the ELF header can also be passed -when using the elfcorehdr=[size[KMG]@]offset[KMG] syntax. - - -With the dump-capture kernel, you can access the memory image through -/proc/vmcore. This exports the dump as an ELF-format file that you can -write out using file copy commands such as cp or scp. Further, you can -use analysis tools such as the GNU Debugger (GDB) and the Crash tool to -debug the dump file. This method ensures that the dump pages are correctly -ordered. - - -Setup and Installation -====================== - -Install kexec-tools -------------------- - -1) Login as the root user. - -2) Download the kexec-tools user-space package from the following URL: - -http://kernel.org/pub/linux/utils/kernel/kexec/kexec-tools.tar.gz - -This is a symlink to the latest version. - -The latest kexec-tools git tree is available at: - -- git://git.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git -- http://www.kernel.org/pub/scm/utils/kernel/kexec/kexec-tools.git - -There is also a gitweb interface available at -http://www.kernel.org/git/?p=utils/kernel/kexec/kexec-tools.git - -More information about kexec-tools can be found at -http://horms.net/projects/kexec/ - -3) Unpack the tarball with the tar command, as follows:: - - tar xvpzf kexec-tools.tar.gz - -4) Change to the kexec-tools directory, as follows:: - - cd kexec-tools-VERSION - -5) Configure the package, as follows:: - - ./configure - -6) Compile the package, as follows:: - - make - -7) Install the package, as follows:: - - make install - - -Build the system and dump-capture kernels ------------------------------------------ -There are two possible methods of using Kdump. - -1) Build a separate custom dump-capture kernel for capturing the - kernel core dump. - -2) Or use the system kernel binary itself as dump-capture kernel and there is - no need to build a separate dump-capture kernel. This is possible - only with the architectures which support a relocatable kernel. As - of today, i386, x86_64, ppc64, ia64, arm and arm64 architectures support - relocatable kernel. - -Building a relocatable kernel is advantageous from the point of view that -one does not have to build a second kernel for capturing the dump. But -at the same time one might want to build a custom dump capture kernel -suitable to his needs. - -Following are the configuration setting required for system and -dump-capture kernels for enabling kdump support. - -System kernel config options ----------------------------- - -1) Enable "kexec system call" in "Processor type and features.":: - - CONFIG_KEXEC=y - -2) Enable "sysfs file system support" in "Filesystem" -> "Pseudo - filesystems." This is usually enabled by default:: - - CONFIG_SYSFS=y - - Note that "sysfs file system support" might not appear in the "Pseudo - filesystems" menu if "Configure standard kernel features (for small - systems)" is not enabled in "General Setup." In this case, check the - .config file itself to ensure that sysfs is turned on, as follows:: - - grep 'CONFIG_SYSFS' .config - -3) Enable "Compile the kernel with debug info" in "Kernel hacking.":: - - CONFIG_DEBUG_INFO=Y - - This causes the kernel to be built with debug symbols. The dump - analysis tools require a vmlinux with debug symbols in order to read - and analyze a dump file. - -Dump-capture kernel config options (Arch Independent) ------------------------------------------------------ - -1) Enable "kernel crash dumps" support under "Processor type and - features":: - - CONFIG_CRASH_DUMP=y - -2) Enable "/proc/vmcore support" under "Filesystems" -> "Pseudo filesystems":: - - CONFIG_PROC_VMCORE=y - - (CONFIG_PROC_VMCORE is set by default when CONFIG_CRASH_DUMP is selected.) - -Dump-capture kernel config options (Arch Dependent, i386 and x86_64) --------------------------------------------------------------------- - -1) On i386, enable high memory support under "Processor type and - features":: - - CONFIG_HIGHMEM64G=y - - or:: - - CONFIG_HIGHMEM4G - -2) On i386 and x86_64, disable symmetric multi-processing support - under "Processor type and features":: - - CONFIG_SMP=n - - (If CONFIG_SMP=y, then specify maxcpus=1 on the kernel command line - when loading the dump-capture kernel, see section "Load the Dump-capture - Kernel".) - -3) If one wants to build and use a relocatable kernel, - Enable "Build a relocatable kernel" support under "Processor type and - features":: - - CONFIG_RELOCATABLE=y - -4) Use a suitable value for "Physical address where the kernel is - loaded" (under "Processor type and features"). This only appears when - "kernel crash dumps" is enabled. A suitable value depends upon - whether kernel is relocatable or not. - - If you are using a relocatable kernel use CONFIG_PHYSICAL_START=0x100000 - This will compile the kernel for physical address 1MB, but given the fact - kernel is relocatable, it can be run from any physical address hence - kexec boot loader will load it in memory region reserved for dump-capture - kernel. - - Otherwise it should be the start of memory region reserved for - second kernel using boot parameter "crashkernel=Y@X". Here X is - start of memory region reserved for dump-capture kernel. - Generally X is 16MB (0x1000000). So you can set - CONFIG_PHYSICAL_START=0x1000000 - -5) Make and install the kernel and its modules. DO NOT add this kernel - to the boot loader configuration files. - -Dump-capture kernel config options (Arch Dependent, ppc64) ----------------------------------------------------------- - -1) Enable "Build a kdump crash kernel" support under "Kernel" options:: - - CONFIG_CRASH_DUMP=y - -2) Enable "Build a relocatable kernel" support:: - - CONFIG_RELOCATABLE=y - - Make and install the kernel and its modules. - -Dump-capture kernel config options (Arch Dependent, ia64) ----------------------------------------------------------- - -- No specific options are required to create a dump-capture kernel - for ia64, other than those specified in the arch independent section - above. This means that it is possible to use the system kernel - as a dump-capture kernel if desired. - - The crashkernel region can be automatically placed by the system - kernel at run time. This is done by specifying the base address as 0, - or omitting it all together:: - - crashkernel=256M@0 - - or:: - - crashkernel=256M - - If the start address is specified, note that the start address of the - kernel will be aligned to 64Mb, so if the start address is not then - any space below the alignment point will be wasted. - -Dump-capture kernel config options (Arch Dependent, arm) ----------------------------------------------------------- - -- To use a relocatable kernel, - Enable "AUTO_ZRELADDR" support under "Boot" options:: - - AUTO_ZRELADDR=y - -Dump-capture kernel config options (Arch Dependent, arm64) ----------------------------------------------------------- - -- Please note that kvm of the dump-capture kernel will not be enabled - on non-VHE systems even if it is configured. This is because the CPU - will not be reset to EL2 on panic. - -Extended crashkernel syntax -=========================== - -While the "crashkernel=size[@offset]" syntax is sufficient for most -configurations, sometimes it's handy to have the reserved memory dependent -on the value of System RAM -- that's mostly for distributors that pre-setup -the kernel command line to avoid a unbootable system after some memory has -been removed from the machine. - -The syntax is:: - - crashkernel=:[,:,...][@offset] - range=start-[end] - -For example:: - - crashkernel=512M-2G:64M,2G-:128M - -This would mean: - - 1) if the RAM is smaller than 512M, then don't reserve anything - (this is the "rescue" case) - 2) if the RAM size is between 512M and 2G (exclusive), then reserve 64M - 3) if the RAM size is larger than 2G, then reserve 128M - - - -Boot into System Kernel -======================= - -1) Update the boot loader (such as grub, yaboot, or lilo) configuration - files as necessary. - -2) Boot the system kernel with the boot parameter "crashkernel=Y@X", - where Y specifies how much memory to reserve for the dump-capture kernel - and X specifies the beginning of this reserved memory. For example, - "crashkernel=64M@16M" tells the system kernel to reserve 64 MB of memory - starting at physical address 0x01000000 (16MB) for the dump-capture kernel. - - On x86 and x86_64, use "crashkernel=64M@16M". - - On ppc64, use "crashkernel=128M@32M". - - On ia64, 256M@256M is a generous value that typically works. - The region may be automatically placed on ia64, see the - dump-capture kernel config option notes above. - If use sparse memory, the size should be rounded to GRANULE boundaries. - - On s390x, typically use "crashkernel=xxM". The value of xx is dependent - on the memory consumption of the kdump system. In general this is not - dependent on the memory size of the production system. - - On arm, the use of "crashkernel=Y@X" is no longer necessary; the - kernel will automatically locate the crash kernel image within the - first 512MB of RAM if X is not given. - - On arm64, use "crashkernel=Y[@X]". Note that the start address of - the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). - -Load the Dump-capture Kernel -============================ - -After booting to the system kernel, dump-capture kernel needs to be -loaded. - -Based on the architecture and type of image (relocatable or not), one -can choose to load the uncompressed vmlinux or compressed bzImage/vmlinuz -of dump-capture kernel. Following is the summary. - -For i386 and x86_64: - - - Use vmlinux if kernel is not relocatable. - - Use bzImage/vmlinuz if kernel is relocatable. - -For ppc64: - - - Use vmlinux - -For ia64: - - - Use vmlinux or vmlinuz.gz - -For s390x: - - - Use image or bzImage - -For arm: - - - Use zImage - -For arm64: - - - Use vmlinux or Image - -If you are using an uncompressed vmlinux image then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= --args-linux \ - --append="root= " - -If you are using a compressed bzImage/vmlinuz, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -If you are using a compressed zImage, then use following command -to load dump-capture kernel:: - - kexec --type zImage -p \ - --initrd= \ - --dtb= \ - --append="root= " - -If you are using an uncompressed Image, then use following command -to load dump-capture kernel:: - - kexec -p \ - --initrd= \ - --append="root= " - -Please note, that --args-linux does not need to be specified for ia64. -It is planned to make this a no-op on that architecture, but for now -it should be omitted - -Following are the arch specific command line options to be used while -loading dump-capture kernel. - -For i386, x86_64 and ia64: - - "1 irqpoll maxcpus=1 reset_devices" - -For ppc64: - - "1 maxcpus=1 noirqdistrib reset_devices" - -For s390x: - - "1 maxcpus=1 cgroup_disable=memory" - -For arm: - - "1 maxcpus=1 reset_devices" - -For arm64: - - "1 maxcpus=1 reset_devices" - -Notes on loading the dump-capture kernel: - -* By default, the ELF headers are stored in ELF64 format to support - systems with more than 4GB memory. On i386, kexec automatically checks if - the physical RAM size exceeds the 4 GB limit and if not, uses ELF32. - So, on non-PAE systems, ELF32 is always used. - - The --elf32-core-headers option can be used to force the generation of ELF32 - headers. This is necessary because GDB currently cannot open vmcore files - with ELF64 headers on 32-bit systems. - -* The "irqpoll" boot parameter reduces driver initialization failures - due to shared interrupts in the dump-capture kernel. - -* You must specify in the format corresponding to the root - device name in the output of mount command. - -* Boot parameter "1" boots the dump-capture kernel into single-user - mode without networking. If you want networking, use "3". - -* We generally don't have to bring up a SMP kernel just to capture the - dump. Hence generally it is useful either to build a UP dump-capture - kernel or specify maxcpus=1 option while loading dump-capture kernel. - Note, though maxcpus always works, you had better replace it with - nr_cpus to save memory if supported by the current ARCH, such as x86. - -* You should enable multi-cpu support in dump-capture kernel if you intend - to use multi-thread programs with it, such as parallel dump feature of - makedumpfile. Otherwise, the multi-thread program may have a great - performance degradation. To enable multi-cpu support, you should bring up an - SMP dump-capture kernel and specify maxcpus/nr_cpus, disable_cpu_apicid=[X] - options while loading it. - -* For s390x there are two kdump modes: If a ELF header is specified with - the elfcorehdr= kernel parameter, it is used by the kdump kernel as it - is done on all other architectures. If no elfcorehdr= kernel parameter is - specified, the s390x kdump kernel dynamically creates the header. The - second mode has the advantage that for CPU and memory hotplug, kdump has - not to be reloaded with kexec_load(). - -* For s390x systems with many attached devices the "cio_ignore" kernel - parameter should be used for the kdump kernel in order to prevent allocation - of kernel memory for devices that are not relevant for kdump. The same - applies to systems that use SCSI/FCP devices. In that case the - "allow_lun_scan" zfcp module parameter should be set to zero before - setting FCP devices online. - -Kernel Panic -============ - -After successfully loading the dump-capture kernel as previously -described, the system will reboot into the dump-capture kernel if a -system crash is triggered. Trigger points are located in panic(), -die(), die_nmi() and in the sysrq handler (ALT-SysRq-c). - -The following conditions will execute a crash trigger point: - -If a hard lockup is detected and "NMI watchdog" is configured, the system -will boot into the dump-capture kernel ( die_nmi() ). - -If die() is called, and it happens to be a thread with pid 0 or 1, or die() -is called inside interrupt context or die() is called and panic_on_oops is set, -the system will boot into the dump-capture kernel. - -On powerpc systems when a soft-reset is generated, die() is called by all cpus -and the system will boot into the dump-capture kernel. - -For testing purposes, you can trigger a crash by using "ALT-SysRq-c", -"echo c > /proc/sysrq-trigger" or write a module to force the panic. - -Write Out the Dump File -======================= - -After the dump-capture kernel is booted, write out the dump file with -the following command:: - - cp /proc/vmcore - - -Analysis -======== - -Before analyzing the dump image, you should reboot into a stable kernel. - -You can do limited analysis using GDB on the dump file copied out of -/proc/vmcore. Use the debug vmlinux built with -g and run the following -command:: - - gdb vmlinux - -Stack trace for the task on processor 0, register display, and memory -display work fine. - -Note: GDB cannot analyze core files generated in ELF64 format for x86. -On systems with a maximum of 4GB of memory, you can generate -ELF32-format headers using the --elf32-core-headers kernel option on the -dump kernel. - -You can also use the Crash utility to analyze dump files in Kdump -format. Crash is available on Dave Anderson's site at the following URL: - - http://people.redhat.com/~anderson/ - -Trigger Kdump on WARN() -======================= - -The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This -will cause a kdump to occur at the panic() call. In cases where a user wants -to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 -to achieve the same behaviour. - -Contact -======= - -- Vivek Goyal (vgoyal@redhat.com) -- Maneesh Soni (maneesh@in.ibm.com) - -GDB macros -========== - -.. include:: gdbmacros.txt - :literal: diff --git a/Documentation/kdump/vmcoreinfo.rst b/Documentation/kdump/vmcoreinfo.rst deleted file mode 100644 index 007a6b86e0ee..000000000000 --- a/Documentation/kdump/vmcoreinfo.rst +++ /dev/null @@ -1,488 +0,0 @@ -========== -VMCOREINFO -========== - -What is it? -=========== - -VMCOREINFO is a special ELF note section. It contains various -information from the kernel like structure size, page size, symbol -values, field offsets, etc. These data are packed into an ELF note -section and used by user-space tools like crash and makedumpfile to -analyze a kernel's memory layout. - -Common variables -================ - -init_uts_ns.name.release ------------------------- - -The version of the Linux kernel. Used to find the corresponding source -code from which the kernel has been built. For example, crash uses it to -find the corresponding vmlinux in order to process vmcore. - -PAGE_SIZE ---------- - -The size of a page. It is the smallest unit of data used by the memory -management facilities. It is usually 4096 bytes of size and a page is -aligned on 4096 bytes. Used for computing page addresses. - -init_uts_ns ------------ - -The UTS namespace which is used to isolate two specific elements of the -system that relate to the uname(2) system call. It is named after the -data structure used to store information returned by the uname(2) system -call. - -User-space tools can get the kernel name, host name, kernel release -number, kernel version, architecture name and OS type from it. - -node_online_map ---------------- - -An array node_states[N_ONLINE] which represents the set of online nodes -in a system, one bit position per node number. Used to keep track of -which nodes are in the system and online. - -swapper_pg_dir --------------- - -The global page directory pointer of the kernel. Used to translate -virtual to physical addresses. - -_stext ------- - -Defines the beginning of the text section. In general, _stext indicates -the kernel start address. Used to convert a virtual address from the -direct kernel map to a physical address. - -vmap_area_list --------------- - -Stores the virtual area list. makedumpfile gets the vmalloc start value -from this variable and its value is necessary for vmalloc translation. - -mem_map -------- - -Physical addresses are translated to struct pages by treating them as -an index into the mem_map array. Right-shifting a physical address -PAGE_SHIFT bits converts it into a page frame number which is an index -into that mem_map array. - -Used to map an address to the corresponding struct page. - -contig_page_data ----------------- - -Makedumpfile gets the pglist_data structure from this symbol, which is -used to describe the memory layout. - -User-space tools use this to exclude free pages when dumping memory. - -mem_section|(mem_section, NR_SECTION_ROOTS)|(mem_section, section_mem_map) --------------------------------------------------------------------------- - -The address of the mem_section array, its length, structure size, and -the section_mem_map offset. - -It exists in the sparse memory mapping model, and it is also somewhat -similar to the mem_map variable, both of them are used to translate an -address. - -page ----- - -The size of a page structure. struct page is an important data structure -and it is widely used to compute contiguous memory. - -pglist_data ------------ - -The size of a pglist_data structure. This value is used to check if the -pglist_data structure is valid. It is also used for checking the memory -type. - -zone ----- - -The size of a zone structure. This value is used to check if the zone -structure has been found. It is also used for excluding free pages. - -free_area ---------- - -The size of a free_area structure. It indicates whether the free_area -structure is valid or not. Useful when excluding free pages. - -list_head ---------- - -The size of a list_head structure. Used when iterating lists in a -post-mortem analysis session. - -nodemask_t ----------- - -The size of a nodemask_t type. Used to compute the number of online -nodes. - -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) -------------------------------------------------------------------------------------------------- - -User-space tools compute their values based on the offset of these -variables. The variables are used when excluding unnecessary pages. - -(pglist_data, node_zones|nr_zones|node_mem_map|node_start_pfn|node_spanned_pages|node_id) ------------------------------------------------------------------------------------------ - -On NUMA machines, each NUMA node has a pg_data_t to describe its memory -layout. On UMA machines there is a single pglist_data which describes the -whole memory. - -These values are used to check the memory type and to compute the -virtual address for memory map. - -(zone, free_area|vm_stat|spanned_pages) ---------------------------------------- - -Each node is divided into a number of blocks called zones which -represent ranges within memory. A zone is described by a structure zone. - -User-space tools compute required values based on the offset of these -variables. - -(free_area, free_list) ----------------------- - -Offset of the free_list's member. This value is used to compute the number -of free pages. - -Each zone has a free_area structure array called free_area[MAX_ORDER]. -The free_list represents a linked list of free page blocks. - -(list_head, next|prev) ----------------------- - -Offsets of the list_head's members. list_head is used to define a -circular linked list. User-space tools need these in order to traverse -lists. - -(vmap_area, va_start|list) --------------------------- - -Offsets of the vmap_area's members. They carry vmalloc-specific -information. Makedumpfile gets the start address of the vmalloc region -from this. - -(zone.free_area, MAX_ORDER) ---------------------------- - -Free areas descriptor. User-space tools use this value to iterate the -free_area ranges. MAX_ORDER is used by the zone buddy allocator. - -log_first_idx -------------- - -Index of the first record stored in the buffer log_buf. Used by -user-space tools to read the strings in the log_buf. - -log_buf -------- - -Console output is written to the ring buffer log_buf at index -log_first_idx. Used to get the kernel log. - -log_buf_len ------------ - -log_buf's length. - -clear_idx ---------- - -The index that the next printk() record to read after the last clear -command. It indicates the first record after the last SYSLOG_ACTION -_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump -the dmesg log. - -log_next_idx ------------- - -The index of the next record to store in the buffer log_buf. Used to -compute the index of the current buffer position. - -printk_log ----------- - -The size of a structure printk_log. Used to compute the size of -messages, and extract dmesg log. It encapsulates header information for -log_buf, such as timestamp, syslog level, etc. - -(printk_log, ts_nsec|len|text_len|dict_len) -------------------------------------------- - -It represents field offsets in struct printk_log. User space tools -parse it and check whether the values of printk_log's members have been -changed. - -(free_area.free_list, MIGRATE_TYPES) ------------------------------------- - -The number of migrate types for pages. The free_list is described by the -array. Used by tools to compute the number of free pages. - -NR_FREE_PAGES -------------- - -On linux-2.6.21 or later, the number of free pages is in -vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. - -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask ------------------------------------------------------------------------------- - -Page attributes. These flags are used to filter various unnecessary for -dumping pages. - -PAGE_BUDDY_MAPCOUNT_VALUE(~PG_buddy)|PAGE_OFFLINE_MAPCOUNT_VALUE(~PG_offline) ------------------------------------------------------------------------------ - -More page attributes. These flags are used to filter various unnecessary for -dumping pages. - - -HUGETLB_PAGE_DTOR ------------------ - -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile -excludes these pages. - -x86_64 -====== - -phys_base ---------- - -Used to convert the virtual address of an exported kernel symbol to its -corresponding physical address. - -init_top_pgt ------------- - -Used to walk through the whole page table and convert virtual addresses -to physical addresses. The init_top_pgt is somewhat similar to -swapper_pg_dir, but it is only used in x86_64. - -pgtable_l5_enabled ------------------- - -User-space tools need to know whether the crash kernel was in 5-level -paging mode. - -node_data ---------- - -This is a struct pglist_data array and stores all NUMA nodes -information. Makedumpfile gets the pglist_data structure from it. - -(node_data, MAX_NUMNODES) -------------------------- - -The maximum number of nodes in system. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -KERNEL_IMAGE_SIZE ------------------ - -Currently unused by Makedumpfile. Used to compute the module virtual -address by Crash. - -sme_mask --------- - -AMD-specific with SME support: it indicates the secure memory encryption -mask. Makedumpfile tools need to know whether the crash kernel was -encrypted. If SME is enabled in the first kernel, the crash kernel's -page table entries (pgd/pud/pmd/pte) contain the memory encryption -mask. This is used to remove the SME mask and obtain the true physical -address. - -Currently, sme_mask stores the value of the C-bit position. If needed, -additional SME-relevant info can be placed in that variable. - -For example:: - - [ misc ][ enc bit ][ other misc SME info ] - 0000_0000_0000_0000_1000_0000_0000_0000_0000_0000_..._0000 - 63 59 55 51 47 43 39 35 31 27 ... 3 - -x86_32 -====== - -X86_PAE -------- - -Denotes whether physical address extensions are enabled. It has the cost -of a higher page table lookup overhead, and also consumes more page -table space per process. Used to check whether PAE was enabled in the -crash kernel when converting virtual addresses to physical addresses. - -ia64 -==== - -pgdat_list|(pgdat_list, MAX_NUMNODES) -------------------------------------- - -pg_data_t array storing all NUMA nodes information. MAX_NUMNODES -indicates the number of the nodes. - -node_memblk|(node_memblk, NR_NODE_MEMBLKS) ------------------------------------------- - -List of node memory chunks. Filled when parsing the SRAT table to obtain -information about memory nodes. NR_NODE_MEMBLKS indicates the number of -node memory chunks. - -These values are used to compute the number of nodes the crashed kernel used. - -node_memblk_s|(node_memblk_s, start_paddr)|(node_memblk_s, size) ----------------------------------------------------------------- - -The size of a struct node_memblk_s and the offsets of the -node_memblk_s's members. Used to compute the number of nodes. - -PGTABLE_3|PGTABLE_4 -------------------- - -User-space tools need to know whether the crash kernel was in 3-level or -4-level paging mode. Used to distinguish the page table. - -ARM64 -===== - -VA_BITS -------- - -The maximum number of bits for virtual addresses. Used to compute the -virtual memory ranges. - -kimage_voffset --------------- - -The offset between the kernel virtual and physical mappings. Used to -translate virtual to physical addresses. - -PHYS_OFFSET ------------ - -Indicates the physical address of the start of memory. Similar to -kimage_voffset, which is used to translate virtual to physical -addresses. - -KERNELOFFSET ------------- - -The kernel randomization offset. Used to compute the page offset. If -KASLR is disabled, this value is zero. - -arm -=== - -ARM_LPAE --------- - -It indicates whether the crash kernel supports large physical address -extensions. Used to translate virtual to physical addresses. - -s390 -==== - -lowcore_ptr ------------ - -An array with a pointer to the lowcore of every CPU. Used to print the -psw and all registers information. - -high_memory ------------ - -Used to get the vmalloc_start address from the high_memory symbol. - -(lowcore_ptr, NR_CPUS) ----------------------- - -The maximum number of CPUs. - -powerpc -======= - - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -contig_page_data ----------------- - -See above. - -vmemmap_list ------------- - -The vmemmap_list maintains the entire vmemmap physical mapping. Used -to get vmemmap list count and populated vmemmap regions info. If the -vmemmap address translation information is stored in the crash kernel, -it is used to translate vmemmap kernel virtual addresses. - -mmu_vmemmap_psize ------------------ - -The size of a page. Used to translate virtual to physical addresses. - -mmu_psize_defs --------------- - -Page size definitions, i.e. 4k, 64k, or 16M. - -Used to make vtop translations. - -vmemmap_backing|(vmemmap_backing, list)|(vmemmap_backing, phys)|(vmemmap_backing, virt_addr) --------------------------------------------------------------------------------------------- - -The vmemmap virtual address space management does not have a traditional -page table to track which virtual struct pages are backed by a physical -mapping. The virtual to physical mappings are tracked in a simple linked -list format. - -User-space tools need to know the offset of list, phys and virt_addr -when computing the count of vmemmap regions. - -mmu_psize_def|(mmu_psize_def, shift) ------------------------------------- - -The size of a struct mmu_psize_def and the offset of mmu_psize_def's -member. - -Used in vtop translations. - -sh -== - -node_data|(node_data, MAX_NUMNODES) ------------------------------------ - -See above. - -X2TLB ------ - -Indicates whether the crashed kernel enabled SH extended mode. diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 0c41d6d463f3..10e7f4d16c14 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -59,7 +59,7 @@ as follows: the default calculated size. Use this option if default boot memory size is not sufficient for second kernel to boot successfully. For syntax of crashkernel= parameter, - refer to Documentation/kdump/kdump.rst. If any offset is + refer to Documentation/admin-guide/kdump/kdump.rst. If any offset is provided in crashkernel= parameter, it will be ignored as fadump uses a predefined offset to reserve memory for boot memory dump preservation in case of a crash. diff --git a/Documentation/translations/zh_CN/oops-tracing.txt b/Documentation/translations/zh_CN/oops-tracing.txt index 368ddd05b304..c5f3bda7abcb 100644 --- a/Documentation/translations/zh_CN/oops-tracing.txt +++ b/Documentation/translations/zh_CN/oops-tracing.txt @@ -53,8 +53,8 @@ cat /proc/kmsg > file, 然而你必须介入中止传输, kmsg是一个“ (2)用串口终端启动(请参看Documentation/admin-guide/serial-console.rst),运行一个null modem到另一台机器并用你喜欢的通讯工具获取输出。Minicom工作地很好。 -(3)使用Kdump(请参看Documentation/kdump/kdump.rst), -使用在Documentation/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 +(3)使用Kdump(请参看Documentation/admin-guide/kdump/kdump.rst), +使用在Documentation/admin-guide/kdump/gdbmacros.txt中定义的dmesg gdb宏,从旧的内存中提取内核 环形缓冲区。 完整信息 diff --git a/MAINTAINERS b/MAINTAINERS index 288f84dbd480..b36028f43192 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8675,7 +8675,7 @@ R: Vivek Goyal L: kexec@lists.infradead.org W: http://lse.sourceforge.net/kdump/ S: Maintained -F: Documentation/kdump/ +F: Documentation/admin-guide/kdump/ KEENE FM RADIO TRANSMITTER DRIVER M: Hans Verkuil diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 6425871e9903..20afd6077465 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2036,7 +2036,7 @@ config CRASH_DUMP kdump/kexec. The crash dump kernel must be compiled to a memory address not used by the main kernel - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config AUTO_ZRELADDR bool "Auto calculation of the decompressed kernel image address" diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a4b22bbf0590..86f81b5afd95 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -996,7 +996,7 @@ config CRASH_DUMP reserved region and then later executed after a crash by kdump/kexec. - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config XEN_DOM0 def_bool y diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 31a7d12db705..c2858ac6a46a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -626,7 +626,7 @@ config CRASH_DUMP to a memory address not used by the main kernel using PHYSICAL_START. - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d0bbca65e4a4..9505066b7ba3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2057,7 +2057,7 @@ config CRASH_DUMP to a memory address not used by the main kernel or BIOS using PHYSICAL_START, or it must be built as a relocatable image (CONFIG_RELOCATABLE=y). - For more details see Documentation/kdump/kdump.rst + For more details see Documentation/admin-guide/kdump/kdump.rst config KEXEC_JUMP bool "kexec jump" @@ -2094,7 +2094,7 @@ config PHYSICAL_START the reserved region. In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed - kernel. Please take a look at Documentation/kdump/kdump.rst + kernel. Please take a look at Documentation/admin-guide/kdump/kdump.rst for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as -- cgit v1.2.3-70-g09d2 From 4f4cfa6c560c93ba180c30675cf845e1597de44c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 27 Jun 2019 14:56:51 -0300 Subject: docs: admin-guide: add a series of orphaned documents There are lots of documents that belong to the admin-guide but are on random places (most under Documentation root dir). Move them to the admin guide. Signed-off-by: Mauro Carvalho Chehab Acked-by: Alexandre Belloni Acked-by: Bartlomiej Zolnierkiewicz --- Documentation/ABI/stable/sysfs-devices-node | 2 +- Documentation/ABI/testing/procfs-diskstats | 2 +- Documentation/ABI/testing/sysfs-block | 2 +- Documentation/ABI/testing/sysfs-devices-system-cpu | 4 +- Documentation/admin-guide/btmrvl.rst | 124 +++++++ Documentation/admin-guide/clearing-warn-once.rst | 9 + Documentation/admin-guide/cpu-load.rst | 114 +++++++ Documentation/admin-guide/cputopology.rst | 177 ++++++++++ .../admin-guide/device-mapper/statistics.rst | 4 +- Documentation/admin-guide/efi-stub.rst | 100 ++++++ Documentation/admin-guide/highuid.rst | 80 +++++ Documentation/admin-guide/hw-vuln/l1tf.rst | 2 +- Documentation/admin-guide/hw_random.rst | 105 ++++++ Documentation/admin-guide/index.rst | 17 + Documentation/admin-guide/iostats.rst | 197 ++++++++++++ Documentation/admin-guide/kernel-parameters.txt | 2 +- .../admin-guide/kernel-per-CPU-kthreads.rst | 356 +++++++++++++++++++++ Documentation/admin-guide/lcd-panel-cgram.rst | 27 ++ Documentation/admin-guide/ldm.rst | 121 +++++++ Documentation/admin-guide/lockup-watchdogs.rst | 83 +++++ Documentation/admin-guide/mm/cma_debugfs.rst | 25 ++ Documentation/admin-guide/mm/index.rst | 1 + Documentation/admin-guide/numastat.rst | 30 ++ Documentation/admin-guide/pnp.rst | 292 +++++++++++++++++ Documentation/admin-guide/rtc.rst | 140 ++++++++ Documentation/admin-guide/svga.rst | 249 ++++++++++++++ Documentation/admin-guide/sysctl/kernel.rst | 2 +- Documentation/admin-guide/video-output.rst | 34 ++ Documentation/auxdisplay/lcd-panel-cgram.rst | 29 -- Documentation/btmrvl.txt | 124 ------- Documentation/clearing-warn-once.txt | 9 - Documentation/cma/debugfs.rst | 27 -- Documentation/cpu-load.txt | 114 ------- Documentation/cputopology.txt | 177 ---------- Documentation/efi-stub.txt | 100 ------ Documentation/fb/vesafb.rst | 2 +- Documentation/highuid.txt | 80 ----- Documentation/hw_random.txt | 105 ------ Documentation/iostats.txt | 197 ------------ Documentation/kernel-per-CPU-kthreads.txt | 356 --------------------- Documentation/ldm.txt | 121 ------- Documentation/lockup-watchdogs.txt | 83 ----- Documentation/numastat.txt | 30 -- Documentation/pnp.txt | 292 ----------------- Documentation/rtc.txt | 140 -------- Documentation/svga.txt | 249 -------------- Documentation/video-output.txt | 34 -- Documentation/x86/topology.rst | 2 +- MAINTAINERS | 12 +- arch/arm/Kconfig | 2 +- arch/parisc/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 4 +- block/partitions/Kconfig | 2 +- drivers/char/Kconfig | 4 +- drivers/char/hw_random/core.c | 2 +- include/linux/hw_random.h | 2 +- 58 files changed, 2310 insertions(+), 2296 deletions(-) create mode 100644 Documentation/admin-guide/btmrvl.rst create mode 100644 Documentation/admin-guide/clearing-warn-once.rst create mode 100644 Documentation/admin-guide/cpu-load.rst create mode 100644 Documentation/admin-guide/cputopology.rst create mode 100644 Documentation/admin-guide/efi-stub.rst create mode 100644 Documentation/admin-guide/highuid.rst create mode 100644 Documentation/admin-guide/hw_random.rst create mode 100644 Documentation/admin-guide/iostats.rst create mode 100644 Documentation/admin-guide/kernel-per-CPU-kthreads.rst create mode 100644 Documentation/admin-guide/lcd-panel-cgram.rst create mode 100644 Documentation/admin-guide/ldm.rst create mode 100644 Documentation/admin-guide/lockup-watchdogs.rst create mode 100644 Documentation/admin-guide/mm/cma_debugfs.rst create mode 100644 Documentation/admin-guide/numastat.rst create mode 100644 Documentation/admin-guide/pnp.rst create mode 100644 Documentation/admin-guide/rtc.rst create mode 100644 Documentation/admin-guide/svga.rst create mode 100644 Documentation/admin-guide/video-output.rst delete mode 100644 Documentation/auxdisplay/lcd-panel-cgram.rst delete mode 100644 Documentation/btmrvl.txt delete mode 100644 Documentation/clearing-warn-once.txt delete mode 100644 Documentation/cma/debugfs.rst delete mode 100644 Documentation/cpu-load.txt delete mode 100644 Documentation/cputopology.txt delete mode 100644 Documentation/efi-stub.txt delete mode 100644 Documentation/highuid.txt delete mode 100644 Documentation/hw_random.txt delete mode 100644 Documentation/iostats.txt delete mode 100644 Documentation/kernel-per-CPU-kthreads.txt delete mode 100644 Documentation/ldm.txt delete mode 100644 Documentation/lockup-watchdogs.txt delete mode 100644 Documentation/numastat.txt delete mode 100644 Documentation/pnp.txt delete mode 100644 Documentation/rtc.txt delete mode 100644 Documentation/svga.txt delete mode 100644 Documentation/video-output.txt (limited to 'arch') diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node index f7ce68fbd4b9..df8413cf1468 100644 --- a/Documentation/ABI/stable/sysfs-devices-node +++ b/Documentation/ABI/stable/sysfs-devices-node @@ -61,7 +61,7 @@ Date: October 2002 Contact: Linux Memory Management list Description: The node's hit/miss statistics, in units of pages. - See Documentation/numastat.txt + See Documentation/admin-guide/numastat.rst What: /sys/devices/system/node/nodeX/distance Date: October 2002 diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats index abac31d216de..2c44b4f1b060 100644 --- a/Documentation/ABI/testing/procfs-diskstats +++ b/Documentation/ABI/testing/procfs-diskstats @@ -29,4 +29,4 @@ Description: 17 - sectors discarded 18 - time spent discarding - For more details refer to Documentation/iostats.txt + For more details refer to Documentation/admin-guide/iostats.rst diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index dfad7427817c..f8c7c7126bb1 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -15,7 +15,7 @@ Description: 9 - I/Os currently in progress 10 - time spent doing I/Os (ms) 11 - weighted time spent doing I/Os (ms) - For more details refer Documentation/iostats.txt + For more details refer Documentation/admin-guide/iostats.rst What: /sys/block///stat diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index d404603c6b52..5f7d7b14fa44 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -34,7 +34,7 @@ Description: CPU topology files that describe kernel limits related to present: cpus that have been identified as being present in the system. - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/probe @@ -103,7 +103,7 @@ Description: CPU topology files that describe a logical CPU's relationship thread_siblings_list: human-readable list of cpu#'s hardware threads within the same core as cpu# - See Documentation/cputopology.txt for more information. + See Documentation/admin-guide/cputopology.rst for more information. What: /sys/devices/system/cpu/cpuidle/current_driver diff --git a/Documentation/admin-guide/btmrvl.rst b/Documentation/admin-guide/btmrvl.rst new file mode 100644 index 000000000000..ec57740ead0c --- /dev/null +++ b/Documentation/admin-guide/btmrvl.rst @@ -0,0 +1,124 @@ +============= +btmrvl driver +============= + +All commands are used via debugfs interface. + +Set/get driver configurations +============================= + +Path: /debug/btmrvl/config/ + +gpiogap=[n], hscfgcmd + These commands are used to configure the host sleep parameters:: + bit 8:0 -- Gap + bit 16:8 -- GPIO + + where GPIO is the pin number of GPIO used to wake up the host. + It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface + wakeup will be used instead). + + where Gap is the gap in milli seconds between wakeup signal and + wakeup event, or 0xff for special host sleep setting. + + Usage:: + + # Use SDIO interface to wake up the host and set GAP to 0x80: + echo 0xff80 > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + + # Use GPIO pin #3 to wake up the host and set GAP to 0xff: + echo 0x03ff > /debug/btmrvl/config/gpiogap + echo 1 > /debug/btmrvl/config/hscfgcmd + +psmode=[n], pscmd + These commands are used to enable/disable auto sleep mode + + where the option is:: + + 1 -- Enable auto sleep mode + 0 -- Disable auto sleep mode + + Usage:: + + # Enable auto sleep mode + echo 1 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + # Disable auto sleep mode + echo 0 > /debug/btmrvl/config/psmode + echo 1 > /debug/btmrvl/config/pscmd + + +hsmode=[n], hscmd + These commands are used to enable host sleep or wake up firmware + + where the option is:: + + 1 -- Enable host sleep + 0 -- Wake up firmware + + Usage:: + + # Enable host sleep + echo 1 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + # Wake up firmware + echo 0 > /debug/btmrvl/config/hsmode + echo 1 > /debug/btmrvl/config/hscmd + + +Get driver status +================= + +Path: /debug/btmrvl/status/ + +Usage:: + + cat /debug/btmrvl/status/ + +where the args are: + +curpsmode + This command displays current auto sleep status. + +psstate + This command display the power save state. + +hsstate + This command display the host sleep state. + +txdnldrdy + This command displays the value of Tx download ready flag. + +Issuing a raw hci command +========================= + +Use hcitool to issue raw hci command, refer to hcitool manual + +Usage:: + + Hcitool cmd [Parameters] + +Interface Control Command:: + + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface + hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface + +SD8688 firmware +=============== + +Images: + +- /lib/firmware/sd8688_helper.bin +- /lib/firmware/sd8688.bin + + +The images can be downloaded from: + +git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/admin-guide/clearing-warn-once.rst b/Documentation/admin-guide/clearing-warn-once.rst new file mode 100644 index 000000000000..211fd926cf00 --- /dev/null +++ b/Documentation/admin-guide/clearing-warn-once.rst @@ -0,0 +1,9 @@ +Clearing WARN_ONCE +------------------ + +WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. + +echo 1 > /sys/kernel/debug/clear_warn_once + +clears the state and allows the warnings to print once again. +This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/admin-guide/cpu-load.rst b/Documentation/admin-guide/cpu-load.rst new file mode 100644 index 000000000000..2d01ce43d2a2 --- /dev/null +++ b/Documentation/admin-guide/cpu-load.rst @@ -0,0 +1,114 @@ +======== +CPU load +======== + +Linux exports various bits of information via ``/proc/stat`` and +``/proc/uptime`` that userland tools, such as top(1), use to calculate +the average time system spent in a particular state, for example:: + + $ iostat + Linux 2.6.18.3-exp (linmac) 02/20/2007 + + avg-cpu: %user %nice %system %iowait %steal %idle + 10.01 0.00 2.92 5.44 0.00 81.63 + + ... + +Here the system thinks that over the default sampling period the +system spent 10.01% of the time doing work in user space, 2.92% in the +kernel, and was overall 81.63% of the time idle. + +In most cases the ``/proc/stat`` information reflects the reality quite +closely, however due to the nature of how/when the kernel collects +this data sometimes it can not be trusted at all. + +So how is this information collected? Whenever timer interrupt is +signalled the kernel looks what kind of task was running at this +moment and increments the counter that corresponds to this tasks +kind/state. The problem with this is that the system could have +switched between various states multiple times between two timer +interrupts yet the counter is incremented only for the last state. + + +Example +------- + +If we imagine the system with one task that periodically burns cycles +in the following manner:: + + time line between two timer interrupts + |--------------------------------------| + ^ ^ + |_ something begins working | + |_ something goes to sleep + (only to be awaken quite soon) + +In the above situation the system will be 0% loaded according to the +``/proc/stat`` (since the timer interrupt will always happen when the +system is executing the idle handler), but in reality the load is +closer to 99%. + +One can imagine many more situations where this behavior of the kernel +will lead to quite erratic information inside ``/proc/stat``:: + + + /* gcc -o hog smallhog.c */ + #include + #include + #include + #include + #define HIST 10 + + static volatile sig_atomic_t stop; + + static void sighandler (int signr) + { + (void) signr; + stop = 1; + } + static unsigned long hog (unsigned long niters) + { + stop = 0; + while (!stop && --niters); + return niters; + } + int main (void) + { + int i; + struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, + .it_value = { .tv_sec = 0, .tv_usec = 1 } }; + sigset_t set; + unsigned long v[HIST]; + double tmp = 0.0; + unsigned long n; + signal (SIGALRM, &sighandler); + setitimer (ITIMER_REAL, &it, NULL); + + hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); + for (i = 0; i < HIST; ++i) tmp += v[i]; + tmp /= HIST; + n = tmp - (tmp / 3.0); + + sigemptyset (&set); + sigaddset (&set, SIGALRM); + + for (;;) { + hog (n); + sigwait (&set, &i); + } + return 0; + } + + +References +---------- + +- http://lkml.org/lkml/2007/2/12/6 +- Documentation/filesystems/proc.txt (1.8) + + +Thanks +------ + +Con Kolivas, Pavel Machek diff --git a/Documentation/admin-guide/cputopology.rst b/Documentation/admin-guide/cputopology.rst new file mode 100644 index 000000000000..b90dafcc8237 --- /dev/null +++ b/Documentation/admin-guide/cputopology.rst @@ -0,0 +1,177 @@ +=========================================== +How CPU topology info is exported via sysfs +=========================================== + +Export CPU topology info via sysfs. Items (attributes) are similar +to /proc/cpuinfo output of some architectures. They reside in +/sys/devices/system/cpu/cpuX/topology/: + +physical_package_id: + + physical package id of cpuX. Typically corresponds to a physical + socket number, but the actual value is architecture and platform + dependent. + +die_id: + + the CPU die ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_id: + + the CPU core ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +book_id: + + the book ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +drawer_id: + + the drawer ID of cpuX. Typically it is the hardware platform's + identifier (rather than the kernel's). The actual value is + architecture and platform dependent. + +core_cpus: + + internal kernel map of CPUs within the same core. + (deprecated name: "thread_siblings") + +core_cpus_list: + + human-readable list of CPUs within the same core. + (deprecated name: "thread_siblings_list"); + +package_cpus: + + internal kernel map of the CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings") + +package_cpus_list: + + human-readable list of CPUs sharing the same physical_package_id. + (deprecated name: "core_siblings_list") + +die_cpus: + + internal kernel map of CPUs within the same die. + +die_cpus_list: + + human-readable list of CPUs within the same die. + +book_siblings: + + internal kernel map of cpuX's hardware threads within the same + book_id. + +book_siblings_list: + + human-readable list of cpuX's hardware threads within the same + book_id. + +drawer_siblings: + + internal kernel map of cpuX's hardware threads within the same + drawer_id. + +drawer_siblings_list: + + human-readable list of cpuX's hardware threads within the same + drawer_id. + +Architecture-neutral, drivers/base/topology.c, exports these attributes. +However, the book and drawer related sysfs files will only be created if +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. + +CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, +where they reflect the cpu and cache hierarchy. + +For an architecture to support this feature, it must define some of +these macros in include/asm-XXX/topology.h:: + + #define topology_physical_package_id(cpu) + #define topology_die_id(cpu) + #define topology_core_id(cpu) + #define topology_book_id(cpu) + #define topology_drawer_id(cpu) + #define topology_sibling_cpumask(cpu) + #define topology_core_cpumask(cpu) + #define topology_die_cpumask(cpu) + #define topology_book_cpumask(cpu) + #define topology_drawer_cpumask(cpu) + +The type of ``**_id macros`` is int. +The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter +correspond with appropriate ``**_siblings`` sysfs attributes (except for +topology_sibling_cpumask() which corresponds with thread_siblings). + +To be consistent on all architectures, include/linux/topology.h +provides default definitions for any of the above macros that are +not defined by include/asm-XXX/topology.h: + +1) topology_physical_package_id: -1 +2) topology_die_id: -1 +3) topology_core_id: 0 +4) topology_sibling_cpumask: just the given CPU +5) topology_core_cpumask: just the given CPU +6) topology_die_cpumask: just the given CPU + +For architectures that don't support books (CONFIG_SCHED_BOOK) there are no +default definitions for topology_book_id() and topology_book_cpumask(). +For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are +no default definitions for topology_drawer_id() and topology_drawer_cpumask(). + +Additionally, CPU topology information is provided under +/sys/devices/system/cpu and includes these files. The internal +source for the output is in brackets ("[]"). + + =========== ========================================================== + kernel_max: the maximum CPU index allowed by the kernel configuration. + [NR_CPUS-1] + + offline: CPUs that are not online because they have been + HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit + of CPUs allowed by the kernel configuration (kernel_max + above). [~cpu_online_mask + cpus >= NR_CPUS] + + online: CPUs that are online and being scheduled [cpu_online_mask] + + possible: CPUs that have been allocated resources and can be + brought online if they are present. [cpu_possible_mask] + + present: CPUs that have been identified as being present in the + system. [cpu_present_mask] + =========== ========================================================== + +The format for the above output is compatible with cpulist_parse() +[see ]. Some examples follow. + +In this example, there are 64 CPUs in the system but cpus 32-63 exceed +the kernel max which is limited to 0..31 by the NR_CPUS config option +being 32. Note also that CPUs 2 and 4-31 are not online but could be +brought online as they are both present and possible:: + + kernel_max: 31 + offline: 2,4-31,32-63 + online: 0-1,3 + possible: 0-31 + present: 0-31 + +In this example, the NR_CPUS config option is 128, but the kernel was +started with possible_cpus=144. There are 4 CPUs in the system and cpu2 +was manually taken offline (and is the only CPU that can be brought +online.):: + + kernel_max: 127 + offline: 2,4-127,128-143 + online: 0-1,3 + possible: 0-127 + present: 0-3 + +See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter +as well as more information on the various cpumasks. diff --git a/Documentation/admin-guide/device-mapper/statistics.rst b/Documentation/admin-guide/device-mapper/statistics.rst index 3d80a9f850cc..41ded0bc5933 100644 --- a/Documentation/admin-guide/device-mapper/statistics.rst +++ b/Documentation/admin-guide/device-mapper/statistics.rst @@ -13,7 +13,7 @@ the range specified. The I/O statistics counters for each step-sized area of a region are in the same format as `/sys/block/*/stat` or `/proc/diskstats` (see: -Documentation/iostats.txt). But two extra counters (12 and 13) are +Documentation/admin-guide/iostats.rst). But two extra counters (12 and 13) are provided: total time spent reading and writing. When the histogram argument is used, the 14th parameter is reported that represents the histogram of latencies. All these counters may be accessed by sending @@ -151,7 +151,7 @@ Messages The first 11 counters have the same meaning as `/sys/block/*/stat or /proc/diskstats`. - Please refer to Documentation/iostats.txt for details. + Please refer to Documentation/admin-guide/iostats.rst for details. 1. the number of reads completed 2. the number of reads merged diff --git a/Documentation/admin-guide/efi-stub.rst b/Documentation/admin-guide/efi-stub.rst new file mode 100644 index 000000000000..833edb0d0bc4 --- /dev/null +++ b/Documentation/admin-guide/efi-stub.rst @@ -0,0 +1,100 @@ +================= +The EFI Boot Stub +================= + +On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade +as a PE/COFF image, thereby convincing EFI firmware loaders to load +it as an EFI executable. The code that modifies the bzImage header, +along with the EFI-specific entry point that the firmware loader +jumps to are collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. For ARM the EFI stub is implemented in +arch/arm/boot/compressed/efi-header.S and +arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared +between architectures is in drivers/firmware/efi/libstub. + +For arm64, there is no compressed kernel support, so the Image itself +masquerades as a PE/COFF image and the EFI stub is linked into the +kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S +and drivers/firmware/efi/libstub/arm64-stub.c. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +How to install bzImage.efi +-------------------------- + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partition (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. For ARM the +arch/arm/boot/zImage should be copied to the system partition, and it +may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image +should be copied but not necessarily renamed. + + +Passing kernel parameters from the EFI shell +-------------------------------------------- + +Arguments to the kernel can be passed after bzImage.efi, e.g.:: + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +The "initrd=" option +-------------------- + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option. This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout:: + + fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used:: + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. + + +The "dtb=" option +----------------- + +For the ARM and arm64 architectures, a device tree must be provided to +the kernel. Normally firmware shall supply the device tree via the +EFI CONFIGURATION TABLE. However, the "dtb=" command line option can +be used to override the firmware supplied device tree, or to supply +one when firmware is unable to. + +Please note: Firmware adds runtime configuration information to the +device tree before booting the kernel. If dtb= is used to override +the device tree, then any runtime data provided by firmware will be +lost. The dtb= option should only be used either as a debug tool, or +as a last resort when a device tree is not provided in the EFI +CONFIGURATION TABLE. + +"dtb=" is processed in the same manner as the "initrd=" option that is +described above. diff --git a/Documentation/admin-guide/highuid.rst b/Documentation/admin-guide/highuid.rst new file mode 100644 index 000000000000..6ee70465c0ea --- /dev/null +++ b/Documentation/admin-guide/highuid.rst @@ -0,0 +1,80 @@ +=================================================== +Notes on the change from 16-bit UIDs to 32-bit UIDs +=================================================== + +:Author: Chris Wing +:Last updated: January 11, 2000 + +- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t + when communicating between user and kernel space in an ioctl or data + structure. + +- kernel code should use uid_t and gid_t in kernel-private structures and + code. + +What's left to be done for 32-bit UIDs on all Linux architectures: + +- Disk quotas have an interesting limitation that is not related to the + maximum UID/GID. They are limited by the maximum file size on the + underlying filesystem, because quota records are written at offsets + corresponding to the UID in question. + Further investigation is needed to see if the quota system can cope + properly with huge UIDs. If it can deal with 64-bit file offsets on all + architectures, this should not be a problem. + +- Decide whether or not to keep backwards compatibility with the system + accounting file, or if we should break it as the comments suggest + (currently, the old 16-bit UID and GID are still written to disk, and + part of the former pad space is used to store separate 32-bit UID and + GID) + +- Need to validate that OS emulation calls the 16-bit UID + compatibility syscalls, if the OS being emulated used 16-bit UIDs, or + uses the 32-bit UID system calls properly otherwise. + + This affects at least: + + - iBCS on Intel + + - sparc32 emulation on sparc64 + (need to support whatever new 32-bit UID system calls are added to + sparc32) + +- Validate that all filesystems behave properly. + + At present, 32-bit UIDs _should_ work for: + + - ext2 + - ufs + - isofs + - nfs + - coda + - udf + + Ioctl() fixups have been made for: + + - ncpfs + - smbfs + + Filesystems with simple fixups to prevent 16-bit UID wraparound: + + - minix + - sysv + - qnx4 + + Other filesystems have not been checked yet. + +- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in + all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but + more are needed. (as well as new user<->kernel data structures) + +- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, + sh, and sparc32. Fixing this is probably not that important, but would + require adding a new ELF section. + +- The ioctl()s used to control the in-kernel NFS server only support + 16-bit UIDs on arm, i386, m68k, sh, and sparc32. + +- make sure that the UID mapping feature of AX25 networking works properly + (it should be safe because it's always used a 32-bit integer to + communicate between user and kernel) diff --git a/Documentation/admin-guide/hw-vuln/l1tf.rst b/Documentation/admin-guide/hw-vuln/l1tf.rst index 656aee262e23..f83212fae4d5 100644 --- a/Documentation/admin-guide/hw-vuln/l1tf.rst +++ b/Documentation/admin-guide/hw-vuln/l1tf.rst @@ -241,7 +241,7 @@ Guest mitigation mechanisms For further information about confining guests to a single or to a group of cores consult the cpusets documentation: - https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.rst + https://www.kernel.org/doc/Documentation/admin-guide/cgroup-v1/cpusets.rst .. _interrupt_isolation: diff --git a/Documentation/admin-guide/hw_random.rst b/Documentation/admin-guide/hw_random.rst new file mode 100644 index 000000000000..121de96e395e --- /dev/null +++ b/Documentation/admin-guide/hw_random.rst @@ -0,0 +1,105 @@ +========================================================== +Linux support for random number generator in i8xx chipsets +========================================================== + +Introduction +============ + +The hw_random framework is software that makes use of a +special hardware feature on your CPU or motherboard, +a Random Number Generator (RNG). The software has two parts: +a core providing the /dev/hwrng character device and its +sysfs support, plus a hardware-specific driver that plugs +into that core. + +To make the most effective use of these mechanisms, you +should download the support software as well. Download the +latest version of the "rng-tools" package from the +hw_random driver's official Web site: + + http://sourceforge.net/projects/gkernel/ + +Those tools use /dev/hwrng to fill the kernel entropy pool, +which is used internally and exported by the /dev/urandom and +/dev/random special files. + +Theory of operation +=================== + +CHARACTER DEVICE. Using the standard open() +and read() system calls, you can read random data from +the hardware RNG device. This data is NOT CHECKED by any +fitness tests, and could potentially be bogus (if the +hardware is faulty or has been tampered with). Data is only +output if the hardware "has-data" flag is set, but nevertheless +a security-conscious person would run fitness tests on the +data before assuming it is truly random. + +The rng-tools package uses such tests in "rngd", and lets you +run them by hand with a "rngtest" utility. + +/dev/hwrng is char device major 10, minor 183. + +CLASS DEVICE. There is a /sys/class/misc/hw_random node with +two unique attributes, "rng_available" and "rng_current". The +"rng_available" attribute lists the hardware-specific drivers +available, while "rng_current" lists the one which is currently +connected to /dev/hwrng. If your system has more than one +RNG available, you may change the one used by writing a name from +the list in "rng_available" into "rng_current". + +========================================================================== + + +Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) + - Copyright 2000,2001 Jeff Garzik + - Copyright 2000,2001 Philipp Rumpf + + +About the Intel RNG hardware, from the firmware hub datasheet +============================================================= + +The Firmware Hub integrates a Random Number Generator (RNG) +using thermal noise generated from inherently random quantum +mechanical properties of silicon. When not generating new random +bits the RNG circuitry will enter a low power state. Intel will +provide a binary software driver to give third party software +access to our RNG for use as a security feature. At this time, +the RNG is only to be used with a system in an OS-present state. + +Intel RNG Driver notes +====================== + +FIXME: support poll(2) + +.. note:: + + request_mem_region was removed, for three reasons: + + 1) Only one RNG is supported by this driver; + 2) The location used by the RNG is a fixed location in + MMIO-addressable memory; + 3) users with properly working BIOS e820 handling will always + have the region in which the RNG is located reserved, so + request_mem_region calls always fail for proper setups. + However, for people who use mem=XX, BIOS e820 information is + **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can + succeed. + +Driver details +============== + +Based on: + Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet + May 1999 Order Number: 290658-002 R + +Intel 82802 Firmware Hub: + Random Number Generator + Programmer's Reference Manual + December 1999 Order Number: 298029-001 R + +Intel 82802 Firmware HUB Random Number Generator Driver + Copyright (c) 2000 Matt Sottek + +Special thanks to Matt Sottek. I did the "guts", he +did the "brains" and all the testing. diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index a5fdb1a846ce..4e98f5596da0 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -85,8 +85,25 @@ configure specific aspects of kernel behavior to your liking. perf-security acpi/index aoe/index + btmrvl + clearing-warn-once + cpu-load + cputopology device-mapper/index + efi-stub + highuid + hw_random + iostats + kernel-per-CPU-kthreads laptops/index + lcd-panel-cgram + ldm + lockup-watchdogs + numastat + pnp + rtc + svga + video-output .. only:: subproject and html diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst new file mode 100644 index 000000000000..5d63b18bd6d1 --- /dev/null +++ b/Documentation/admin-guide/iostats.rst @@ -0,0 +1,197 @@ +===================== +I/O statistics fields +===================== + +Since 2.4.20 (and some versions before, with patches), and 2.5.45, +more extensive disk statistics have been introduced to help measure disk +activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do +the work for you, but in case you are interested in creating your own +tools, the fields are explained here. + +In 2.4 now, the information is found as additional fields in +``/proc/partitions``. In 2.6 and upper, the same information is found in two +places: one is in the file ``/proc/diskstats``, and the other is within +the sysfs file system, which must be mounted in order to obtain +the information. Throughout this document we'll assume that sysfs +is mounted on ``/sys``, although of course it may be mounted anywhere. +Both ``/proc/diskstats`` and sysfs use the same source for the information +and so should not differ. + +Here are examples of these different formats:: + + 2.4: + 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 + + 2.6+ sysfs: + 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 35486 38030 38030 38030 + + 2.6+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 + 3 1 hda1 35486 38030 38030 38030 + + 4.18+ diskstats: + 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 + +On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have +a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. + +The advantage of one over the other is that the sysfs choice works well +if you are watching a known, small set of disks. ``/proc/diskstats`` may +be a better choice if you are watching a large number of disks because +you'll avoid the overhead of 50, 100, or 500 or more opens/closes with +each snapshot of your disk statistics. + +In 2.4, the statistics fields are those after the device name. In +the above example, the first field of statistics would be 446216. +By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll +find just the eleven fields, beginning with 446216. If you look at +``/proc/diskstats``, the eleven fields will be preceded by the major and +minor device numbers, and device name. Each of these formats provides +eleven fields of statistics, each meaning exactly the same things. +All fields except field 9 are cumulative since boot. Field 9 should +go to zero as I/Os complete; all others only increase (unless they +overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long +(native word size) numbers, and on a very busy or long-lived system they +may wrap. Applications should be prepared to deal with that; unless +your observations are measured in large numbers of minutes or hours, +they should not wrap twice before you notice them. + +Each set of stats only applies to the indicated device; if you want +system-wide stats you'll have to find all the devices and sum them all up. + +Field 1 -- # of reads completed + This is the total number of reads completed successfully. + +Field 2 -- # of reads merged, field 6 -- # of writes merged + Reads and writes which are adjacent to each other may be merged for + efficiency. Thus two 4K reads may become one 8K read before it is + ultimately handed to the disk, and so it will be counted (and queued) + as only one I/O. This field lets you know how often this was done. + +Field 3 -- # of sectors read + This is the total number of sectors read successfully. + +Field 4 -- # of milliseconds spent reading + This is the total number of milliseconds spent by all reads (as + measured from __make_request() to end_that_request_last()). + +Field 5 -- # of writes completed + This is the total number of writes completed successfully. + +Field 6 -- # of writes merged + See the description of field 2. + +Field 7 -- # of sectors written + This is the total number of sectors written successfully. + +Field 8 -- # of milliseconds spent writing + This is the total number of milliseconds spent by all writes (as + measured from __make_request() to end_that_request_last()). + +Field 9 -- # of I/Os currently in progress + The only field that should go to zero. Incremented as requests are + given to appropriate struct request_queue and decremented as they finish. + +Field 10 -- # of milliseconds spent doing I/Os + This field increases so long as field 9 is nonzero. + + Since 5.0 this field counts jiffies when at least one request was + started or completed. If request runs more than 2 jiffies then some + I/O time will not be accounted unless there are other requests. + +Field 11 -- weighted # of milliseconds spent doing I/Os + This field is incremented at each I/O start, I/O completion, I/O + merge, or read of these stats by the number of I/Os in progress + (field 9) times the number of milliseconds spent doing I/O since the + last update of this field. This can provide an easy measure of both + I/O completion time and the backlog that may be accumulating. + +Field 12 -- # of discards completed + This is the total number of discards completed successfully. + +Field 13 -- # of discards merged + See the description of field 2 + +Field 14 -- # of sectors discarded + This is the total number of sectors discarded successfully. + +Field 15 -- # of milliseconds spent discarding + This is the total number of milliseconds spent by all discards (as + measured from __make_request() to end_that_request_last()). + +To avoid introducing performance bottlenecks, no locks are held while +modifying these counters. This implies that minor inaccuracies may be +introduced when changes collide, so (for instance) adding up all the +read I/Os issued per partition should equal those made to the disks ... +but due to the lack of locking it may only be very close. + +In 2.6+, there are counters for each CPU, which make the lack of locking +almost a non-issue. When the statistics are read, the per-CPU counters +are summed (possibly overflowing the unsigned long variable they are +summed to) and the result given to the user. There is no convenient +user interface for accessing the per-CPU counters themselves. + +Disks vs Partitions +------------------- + +There were significant changes between 2.4 and 2.6+ in the I/O subsystem. +As a result, some statistic information disappeared. The translation from +a disk address relative to a partition to the disk address relative to +the host disk happens much earlier. All merges and timings now happen +at the disk level rather than at both the disk and partition level as +in 2.4. Consequently, you'll see a different statistics output on 2.6+ for +partitions from that for disks. There are only *four* fields available +for partitions on 2.6+ machines. This is reflected in the examples above. + +Field 1 -- # of reads issued + This is the total number of reads issued to this partition. + +Field 2 -- # of sectors read + This is the total number of sectors requested to be read from this + partition. + +Field 3 -- # of writes issued + This is the total number of writes issued to this partition. + +Field 4 -- # of sectors written + This is the total number of sectors requested to be written to + this partition. + +Note that since the address is translated to a disk-relative one, and no +record of the partition-relative address is kept, the subsequent success +or failure of the read cannot be attributed to the partition. In other +words, the number of reads for partitions is counted slightly before time +of queuing for partitions, and at completion for whole disks. This is +a subtle distinction that is probably uninteresting for most cases. + +More significant is the error induced by counting the numbers of +reads/writes before merges for partitions and after for disks. Since a +typical workload usually contains a lot of successive and adjacent requests, +the number of reads/writes issued can be several times higher than the +number of reads/writes completed. + +In 2.6.25, the full statistic set is again available for partitions and +disk and partition statistics are consistent again. Since we still don't +keep record of the partition-relative address, an operation is attributed to +the partition which contains the first sector of the request after the +eventual merges. As requests can be merged across partition, this could lead +to some (probably insignificant) inaccuracy. + +Additional notes +---------------- + +In 2.6+, sysfs is not mounted by default. If your distribution of +Linux hasn't added it already, here's the line you'll want to add to +your ``/etc/fstab``:: + + none /sys sysfs defaults 0 0 + + +In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they +appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in +``/proc/stat`` take a very different format from those in ``/proc/partitions`` +(see proc(5), if your system has it.) + +-- ricklind@us.ibm.com diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a571a67e0c85..19b1e3bef56c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5066,7 +5066,7 @@ vga= [BOOT,X86-32] Select a particular video mode See Documentation/x86/boot.rst and - Documentation/svga.txt. + Documentation/admin-guide/svga.rst. Use vga=ask for menu. This is actually a boot loader parameter; the value is passed to the kernel using a special protocol. diff --git a/Documentation/admin-guide/kernel-per-CPU-kthreads.rst b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst new file mode 100644 index 000000000000..4f18456dd3b1 --- /dev/null +++ b/Documentation/admin-guide/kernel-per-CPU-kthreads.rst @@ -0,0 +1,356 @@ +========================================== +Reducing OS jitter due to per-cpu kthreads +========================================== + +This document lists per-CPU kthreads in the Linux kernel and presents +options to control their OS jitter. Note that non-per-CPU kthreads are +not listed here. To reduce OS jitter from non-per-CPU kthreads, bind +them to a "housekeeping" CPU dedicated to such work. + +References +========== + +- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. + +- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. + +- man taskset: Using the taskset command to bind tasks to sets + of CPUs. + +- man sched_setaffinity: Using the sched_setaffinity() system + call to bind tasks to sets of CPUs. + +- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, + writing "0" to offline and "1" to online. + +- In order to locate kernel-generated OS jitter on CPU N: + + cd /sys/kernel/debug/tracing + echo 1 > max_graph_depth # Increase the "1" for more detail + echo function_graph > current_tracer + # run workload + cat per_cpu/cpuN/trace + +kthreads +======== + +Name: + ehca_comp/%u + +Purpose: + Periodically process Infiniband-related work. + +To reduce its OS jitter, do any of the following: + +1. Don't use eHCA Infiniband hardware, instead choosing hardware + that does not require per-CPU kthreads. This will prevent these + kthreads from being created in the first place. (This will + work for most people, as this hardware, though important, is + relatively old and is produced in relatively low unit volumes.) +2. Do all eHCA-Infiniband-related work on other CPUs, including + interrupts. +3. Rework the eHCA driver so that its per-CPU kthreads are + provisioned only on selected CPUs. + + +Name: + irq/%d-%s + +Purpose: + Handle threaded interrupts. + +To reduce its OS jitter, do the following: + +1. Use irq affinity to force the irq threads to execute on + some other CPU. + +Name: + kcmtpd_ctr_%d + +Purpose: + Handle Bluetooth work. + +To reduce its OS jitter, do one of the following: + +1. Don't use Bluetooth, in which case these kthreads won't be + created in the first place. +2. Use irq affinity to force Bluetooth-related interrupts to + occur on some other CPU and furthermore initiate all + Bluetooth activity on some other CPU. + +Name: + ksoftirqd/%u + +Purpose: + Execute softirq handlers when threaded or when under heavy load. + +To reduce its OS jitter, each softirq vector must be handled +separately as follows: + +TIMER_SOFTIRQ +------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by forcing + both kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force + the CPU offline, then bring it back online. This forces + recurring timers to migrate elsewhere. If you are concerned + with multiple CPUs, force them all offline before bringing the + first one back online. Once you have onlined the CPUs in question, + do not offline any other CPUs, because doing so could force the + timer back onto one of the CPUs in question. + +NET_TX_SOFTIRQ and NET_RX_SOFTIRQ +--------------------------------- + +Do all of the following: + +1. Force networking interrupts onto other CPUs. +2. Initiate any network I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +BLOCK_SOFTIRQ +------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +IRQ_POLL_SOFTIRQ +---------------- + +Do all of the following: + +1. Force block-device interrupts onto some other CPU. +2. Initiate any block I/O and block-I/O polling on other CPUs. +3. Once your application has started, prevent CPU-hotplug operations + from being initiated from tasks that might run on the CPU to + be de-jittered. (It is OK to force this CPU offline and then + bring it back online before you start your application.) + +TASKLET_SOFTIRQ +--------------- + +Do one or more of the following: + +1. Avoid use of drivers that use tasklets. (Such drivers will contain + calls to things like tasklet_schedule().) +2. Convert all drivers that you must use from tasklets to workqueues. +3. Force interrupts for drivers using tasklets onto other CPUs, + and also do I/O involving these drivers on other CPUs. + +SCHED_SOFTIRQ +------------- + +Do all of the following: + +1. Avoid sending scheduler IPIs to the CPU to be de-jittered, + for example, ensure that at most one runnable kthread is present + on that CPU. If a thread that expects to run on the de-jittered + CPU awakens, the scheduler will send an IPI that can result in + a subsequent SCHED_SOFTIRQ. +2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered + is marked as an adaptive-ticks CPU using the "nohz_full=" + boot parameter. This reduces the number of scheduler-clock + interrupts that the de-jittered CPU receives, minimizing its + chances of being selected to do the load balancing work that + runs in SCHED_SOFTIRQ context. +3. To the extent possible, keep the CPU out of the kernel when it + is non-idle, for example, by avoiding system calls and by + forcing both kernel threads and interrupts to execute elsewhere. + This further reduces the number of scheduler-clock interrupts + received by the de-jittered CPU. + +HRTIMER_SOFTIRQ +--------------- + +Do all of the following: + +1. To the extent possible, keep the CPU out of the kernel when it + is non-idle. For example, avoid system calls and force both + kernel threads and interrupts to execute elsewhere. +2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the + CPU offline, then bring it back online. This forces recurring + timers to migrate elsewhere. If you are concerned with multiple + CPUs, force them all offline before bringing the first one + back online. Once you have onlined the CPUs in question, do not + offline any other CPUs, because doing so could force the timer + back onto one of the CPUs in question. + +RCU_SOFTIRQ +----------- + +Do at least one of the following: + +1. Offload callbacks and keep the CPU in either dyntick-idle or + adaptive-ticks state by doing all of the following: + + a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be + de-jittered is marked as an adaptive-ticks CPU using the + "nohz_full=" boot parameter. Bind the rcuo kthreads to + housekeeping CPUs, which can tolerate OS jitter. + b. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +2. Enable RCU to do its processing remotely via dyntick-idle by + doing all of the following: + + a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. + b. Ensure that the CPU goes idle frequently, allowing other + CPUs to detect that it has passed through an RCU quiescent + state. If the kernel is built with CONFIG_NO_HZ_FULL=y, + userspace execution also allows other CPUs to detect that + the CPU in question has passed through a quiescent state. + c. To the extent possible, keep the CPU out of the kernel + when it is non-idle, for example, by avoiding system + calls and by forcing both kernel threads and interrupts + to execute elsewhere. + +Name: + kworker/%u:%d%s (cpu, id, priority) + +Purpose: + Execute workqueue requests + +To reduce its OS jitter, do any of the following: + +1. Run your workload at a real-time priority, which will allow + preempting the kworker daemons. +2. A given workqueue can be made visible in the sysfs filesystem + by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). + Such a workqueue can be confined to a given subset of the + CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs + files. The set of WQ_SYSFS workqueues can be displayed using + "ls sys/devices/virtual/workqueue". That said, the workqueues + maintainer would like to caution people against indiscriminately + sprinkling WQ_SYSFS across all the workqueues. The reason for + caution is that it is easy to add WQ_SYSFS, but because sysfs is + part of the formal user/kernel API, it can be nearly impossible + to remove it, even if its addition was a mistake. +3. Do any of the following needed to avoid jitter that your + application cannot tolerate: + + a. Build your kernel with CONFIG_SLUB=y rather than + CONFIG_SLAB=y, thus avoiding the slab allocator's periodic + use of each CPU's workqueues to run its cache_reap() + function. + b. Avoid using oprofile, thus avoiding OS jitter from + wq_sync_buffer(). + c. Limit your CPU frequency so that a CPU-frequency + governor is not required, possibly enlisting the aid of + special heatsinks or other cooling technologies. If done + correctly, and if you CPU architecture permits, you should + be able to build your kernel with CONFIG_CPU_FREQ=n to + avoid the CPU-frequency governor periodically running + on each CPU, including cs_dbs_timer() and od_dbs_timer(). + + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + d. As of v3.18, Christoph Lameter's on-demand vmstat workers + commit prevents OS jitter due to vmstat_update() on + CONFIG_SMP=y systems. Before v3.18, is not possible + to entirely get rid of the OS jitter, but you can + decrease its frequency by writing a large value to + /proc/sys/vm/stat_interval. The default value is HZ, + for an interval of one second. Of course, larger values + will make your virtual-memory statistics update more + slowly. Of course, you can also run your workload at + a real-time priority, thus preempting vmstat_update(), + but if your workload is CPU-bound, this is a bad idea. + However, there is an RFC patch from Christoph Lameter + (based on an earlier one from Gilad Ben-Yossef) that + reduces or even eliminates vmstat overhead for some + workloads at https://lkml.org/lkml/2013/9/4/379. + e. Boot with "elevator=noop" to avoid workqueue use by + the block layer. + f. If running on high-end powerpc servers, build with + CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS + daemon from running on each CPU every second or so. + (This will require editing Kconfig files and will defeat + this platform's RAS functionality.) This avoids jitter + due to the rtas_event_scan() function. + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + g. If running on Cell Processor, build your kernel with + CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from + spu_gov_work(). + WARNING: Please check your CPU specifications to + make sure that this is safe on your particular system. + h. If running on PowerMAC, build your kernel with + CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, + avoiding OS jitter from rackmeter_do_timer(). + +Name: + rcuc/%u + +Purpose: + Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. + +To reduce its OS jitter, do at least one of the following: + +1. Build the kernel with CONFIG_PREEMPT=n. This prevents these + kthreads from being created in the first place, and also obviates + the need for RCU priority boosting. This approach is feasible + for workloads that do not require high degrees of responsiveness. +2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these + kthreads from being created in the first place. This approach + is feasible only if your workload never requires RCU priority + boosting, for example, if you ensure frequent idle time on all + CPUs that might execute within the kernel. +3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= + boot parameter offloading RCU callbacks from all CPUs susceptible + to OS jitter. This approach prevents the rcuc/%u kthreads from + having any work to do, so that they are never awakened. +4. Ensure that the CPU never enters the kernel, and, in particular, + avoid initiating any CPU hotplug operations on this CPU. This is + another way of preventing any callbacks from being queued on the + CPU, again preventing the rcuc/%u kthreads from having any work + to do. + +Name: + rcuop/%d and rcuos/%d + +Purpose: + Offload RCU callbacks from the corresponding CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Use affinity, cgroups, or other mechanism to force these kthreads + to execute on some other CPU. +2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these + kthreads from being created in the first place. However, please + note that this will not eliminate OS jitter, but will instead + shift it to RCU_SOFTIRQ. + +Name: + watchdog/%u + +Purpose: + Detect software lockups on each CPU. + +To reduce its OS jitter, do at least one of the following: + +1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these + kthreads from being created in the first place. +2. Boot with "nosoftlockup=0", which will also prevent these kthreads + from being created. Other related watchdog and softlockup boot + parameters may be found in Documentation/admin-guide/kernel-parameters.rst + and Documentation/watchdog/watchdog-parameters.rst. +3. Echo a zero to /proc/sys/kernel/watchdog to disable the + watchdog timer. +4. Echo a large number of /proc/sys/kernel/watchdog_thresh in + order to reduce the frequency of OS jitter due to the watchdog + timer down to a level that is acceptable for your workload. diff --git a/Documentation/admin-guide/lcd-panel-cgram.rst b/Documentation/admin-guide/lcd-panel-cgram.rst new file mode 100644 index 000000000000..a3eb00c62f53 --- /dev/null +++ b/Documentation/admin-guide/lcd-panel-cgram.rst @@ -0,0 +1,27 @@ +====================================== +Parallel port LCD/Keypad Panel support +====================================== + +Some LCDs allow you to define up to 8 characters, mapped to ASCII +characters 0 to 7. The escape code to define a new character is +'\e[LG' followed by one digit from 0 to 7, representing the character +number, and up to 8 couples of hex digits terminated by a semi-colon +(';'). Each couple of digits represents a line, with 1-bits for each +illuminated pixel with LSB on the right. Lines are numbered from the +top of the character to the bottom. On a 5x7 matrix, only the 5 lower +bits of the 7 first bytes are used for each character. If the string +is incomplete, only complete lines will be redefined. Here are some +examples:: + + printf "\e[LG0010101050D1F0C04;" => 0 = [enter] + printf "\e[LG1040E1F0000000000;" => 1 = [up] + printf "\e[LG2000000001F0E0400;" => 2 = [down] + printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] + printf "\e[LG40002060E1E0E0602;" => 4 = [left] + printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] + printf "\e[LG60016051516141400;" => 6 = "IP" + + printf "\e[LG00103071F1F070301;" => big speaker + printf "\e[LG00002061E1E060200;" => small speaker + +Willy diff --git a/Documentation/admin-guide/ldm.rst b/Documentation/admin-guide/ldm.rst new file mode 100644 index 000000000000..12c571368e73 --- /dev/null +++ b/Documentation/admin-guide/ldm.rst @@ -0,0 +1,121 @@ +========================================== +LDM - Logical Disk Manager (Dynamic Disks) +========================================== + +:Author: Originally Written by FlatCap - Richard Russon . +:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. + +Overview +-------- + +Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete +replacement for the MSDOS style partitions. It stores its information in a +1MiB journalled database at the end of the physical disk. The size of +partitions is limited only by disk space. The maximum number of partitions is +nearly 2000. + +Any partitions created under the LDM are called "Dynamic Disks". There are no +longer any primary or extended partitions. Normal MSDOS style partitions are +now known as Basic Disks. + +If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use +Dynamic Disks. The journalling allows Windows to make changes to these +partitions and filesystems without the need to reboot. + +Once the LDM driver has divided up the disk, you can use the MD driver to +assemble any multi-partition volumes, e.g. Stripes, RAID5. + +To prevent legacy applications from repartitioning the disk, the LDM creates a +dummy MSDOS partition containing one disk-sized partition. This is what is +supported with the Linux LDM driver. + +A newer approach that has been implemented with Vista is to put LDM on top of a +GPT label disk. This is not supported by the Linux LDM driver yet. + + +Example +------- + +Below we have a 50MiB disk, divided into seven partitions. + +.. note:: + + The missing 1MiB at the end of the disk is where the LDM database is + stored. + ++-------++--------------+---------+-----++--------------+---------+----+ +|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| ++=======++==============+=========+=====++==============+=========+====+ +|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| ++-------++--------------+---------+-----++--------------+---------+----+ +|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| ++-------++--------------+---------+-----++--------------+---------+----+ + +The LDM Database may not store the partitions in the order that they appear on +disk, but the driver will sort them. + +When Linux boots, you will see something like:: + + hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 + hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 + + +Compiling LDM Support +--------------------- + +To enable LDM, choose the following two options: + + - "Advanced partition selection" CONFIG_PARTITION_ADVANCED + - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION + +If you believe the driver isn't working as it should, you can enable the extra +debugging code. This will produce a LOT of output. The option is: + + - "Windows LDM extra logging" CONFIG_LDM_DEBUG + +N.B. The partition code cannot be compiled as a module. + +As with all the partition code, if the driver doesn't see signs of its type of +partition, it will pass control to another driver, so there is no harm in +enabling it. + +If you have Dynamic Disks but don't enable the driver, then all you will see +is a dummy MSDOS partition filling the whole disk. You won't be able to mount +any of the volumes on the disk. + + +Booting +------- + +If you enable LDM support, then lilo is capable of booting from any of the +discovered partitions. However, grub does not understand the LDM partitioning +and cannot boot from a Dynamic Disk. + + +More Documentation +------------------ + +There is an Overview of the LDM together with complete Technical Documentation. +It is available for download. + + http://www.linux-ntfs.org/ + +If you have any LDM questions that aren't answered in the documentation, email +me. + +Cheers, + FlatCap - Richard Russon + ldm@flatcap.org + diff --git a/Documentation/admin-guide/lockup-watchdogs.rst b/Documentation/admin-guide/lockup-watchdogs.rst new file mode 100644 index 000000000000..290840c160af --- /dev/null +++ b/Documentation/admin-guide/lockup-watchdogs.rst @@ -0,0 +1,83 @@ +=============================================================== +Softlockup detector and hardlockup detector (aka nmi_watchdog) +=============================================================== + +The Linux kernel can act as a watchdog to detect both soft and hard +lockups. + +A 'softlockup' is defined as a bug that causes the kernel to loop in +kernel mode for more than 20 seconds (see "Implementation" below for +details), without giving other tasks a chance to run. The current +stack trace is displayed upon detection and, by default, the system +will stay locked up. Alternatively, the kernel can be configured to +panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, +"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for +details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are +provided for this. + +A 'hardlockup' is defined as a bug that causes the CPU to loop in +kernel mode for more than 10 seconds (see "Implementation" below for +details), without letting other interrupts have a chance to run. +Similarly to the softlockup case, the current stack trace is displayed +upon detection and the system will stay locked up unless the default +behavior is changed, which can be done through a sysctl, +'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", +and a kernel parameter, "nmi_watchdog" +(see "Documentation/admin-guide/kernel-parameters.rst" for details). + +The panic option can be used in combination with panic_timeout (this +timeout is set through the confusingly named "kernel.panic" sysctl), +to cause the system to reboot automatically after a specified amount +of time. + +Implementation +============== + +The soft and hard lockup detectors are built on top of the hrtimer and +perf subsystems, respectively. A direct consequence of this is that, +in principle, they should work in any architecture where these +subsystems are present. + +A periodic hrtimer runs to generate interrupts and kick the watchdog +task. An NMI perf event is generated every "watchdog_thresh" +(compile-time initialized to 10 and configurable through sysctl of the +same name) seconds to check for hardlockups. If any CPU in the system +does not receive any hrtimer interrupt during that time the +'hardlockup detector' (the handler for the NMI perf event) will +generate a kernel warning or call panic, depending on the +configuration. + +The watchdog task is a high priority kernel thread that updates a +timestamp every time it is scheduled. If that timestamp is not updated +for 2*watchdog_thresh seconds (the softlockup threshold) the +'softlockup detector' (coded inside the hrtimer callback function) +will dump useful debug information to the system log, after which it +will call panic if it was instructed to do so or resume execution of +other kernel code. + +The period of the hrtimer is 2*watchdog_thresh/5, which means it has +two or three chances to generate an interrupt before the hardlockup +detector kicks in. + +As explained above, a kernel knob is provided that allows +administrators to configure the period of the hrtimer and the perf +event. The right value for a particular environment is a trade-off +between fast response to lockups and detection overhead. + +By default, the watchdog runs on all online cores. However, on a +kernel configured with NO_HZ_FULL, by default the watchdog runs only +on the housekeeping cores, not the cores specified in the "nohz_full" +boot argument. If we allowed the watchdog to run by default on +the "nohz_full" cores, we would have to run timer ticks to activate +the scheduler, which would prevent the "nohz_full" functionality +from protecting the user code on those cores from the kernel. +Of course, disabling it by default on the nohz_full cores means that +when those cores do enter the kernel, by default we will not be +able to detect if they lock up. However, allowing the watchdog +to continue to run on the housekeeping (non-tickless) cores means +that we will continue to detect lockups properly on those cores. + +In either case, the set of cores excluded from running the watchdog +may be adjusted via the kernel.watchdog_cpumask sysctl. For +nohz_full cores, this may be useful for debugging a case where the +kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst new file mode 100644 index 000000000000..4e06ffabd78a --- /dev/null +++ b/Documentation/admin-guide/mm/cma_debugfs.rst @@ -0,0 +1,25 @@ +===================== +CMA Debugfs Interface +===================== + +The CMA debugfs interface is useful to retrieve basic information out of the +different CMA areas and to test allocation/release in each of the areas. + +Each CMA zone represents a directory under /cma/, indexed by the +kernel's CMA index. So the first CMA zone would be: + + /cma/cma-0 + +The structure of the files created under that directory is as follows: + + - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. + - [RO] count: Amount of memory in the CMA area. + - [RO] order_per_bit: Order of pages represented by one bit. + - [RO] bitmap: The bitmap of page states in the zone. + - [WO] alloc: Allocate N pages from that CMA area. For example:: + + echo 5 > /cma/cma-2/alloc + +would try to allocate 5 pages from the cma-2 area. + + - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index 5f61a6c429e0..11db46448354 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -26,6 +26,7 @@ the Linux memory management. :maxdepth: 1 concepts + cma_debugfs hugetlbpage idle_page_tracking ksm diff --git a/Documentation/admin-guide/numastat.rst b/Documentation/admin-guide/numastat.rst new file mode 100644 index 000000000000..aaf1667489f8 --- /dev/null +++ b/Documentation/admin-guide/numastat.rst @@ -0,0 +1,30 @@ +=============================== +Numa policy hit/miss statistics +=============================== + +/sys/devices/system/node/node*/numastat + +All units are pages. Hugepages have separate counters. + +=============== ============================================================ +numa_hit A process wanted to allocate memory from this node, + and succeeded. + +numa_miss A process wanted to allocate memory from another node, + but ended up with memory from this node. + +numa_foreign A process wanted to allocate on this node, + but ended up with memory from another one. + +local_node A process ran on this node and got memory from it. + +other_node A process ran on this node and got memory from another node. + +interleave_hit Interleaving wanted to allocate from this node + and succeeded. +=============== ============================================================ + +For easier reading you can use the numastat utility from the numactl package +(http://oss.sgi.com/projects/libnuma/). Note that it only works +well right now on machines with a small number of CPUs. + diff --git a/Documentation/admin-guide/pnp.rst b/Documentation/admin-guide/pnp.rst new file mode 100644 index 000000000000..bab2d10631f0 --- /dev/null +++ b/Documentation/admin-guide/pnp.rst @@ -0,0 +1,292 @@ +================================= +Linux Plug and Play Documentation +================================= + +:Author: Adam Belay +:Last updated: Oct. 16, 2002 + + +Overview +-------- + +Plug and Play provides a means of detecting and setting resources for legacy or +otherwise unconfigurable devices. The Linux Plug and Play Layer provides these +services to compatible drivers. + + +The User Interface +------------------ + +The Linux Plug and Play user interface provides a means to activate PnP devices +for legacy and user level drivers that do not support Linux Plug and Play. The +user interface is integrated into sysfs. + +In addition to the standard sysfs file the following are created in each +device's directory: +- id - displays a list of support EISA IDs +- options - displays possible resource configurations +- resources - displays currently allocated resources and allows resource changes + +activating a device +^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "auto" > resources + +this will invoke the automatic resource config system to activate the device + +manually activating a device +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:: + + # echo "manual " > resources + + - the configuration number + - static or dynamic + static = for next boot + dynamic = now + +disabling a device +^^^^^^^^^^^^^^^^^^ + +:: + + # echo "disable" > resources + + +EXAMPLE: + +Suppose you need to activate the floppy disk controller. + +1. change to the proper directory, in my case it is + /driver/bus/pnp/devices/00:0f:: + + # cd /driver/bus/pnp/devices/00:0f + # cat name + PC standard floppy disk controller + +2. check if the device is already active:: + + # cat resources + DISABLED + + - Notice the string "DISABLED". This means the device is not active. + +3. check the device's possible configurations (optional):: + + # cat options + Dependent: 01 - Priority acceptable + port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding + port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + Dependent: 02 - Priority acceptable + port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding + port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding + irq 6 + dma 2 8-bit compatible + +4. now activate the device:: + + # echo "auto" > resources + +5. finally check if the device is active:: + + # cat resources + io 0x3f0-0x3f5 + io 0x3f7-0x3f7 + irq 6 + dma 2 + +also there are a series of kernel parameters:: + + pnp_reserve_irq=irq1[,irq2] .... + pnp_reserve_dma=dma1[,dma2] .... + pnp_reserve_io=io1,size1[,io2,size2] .... + pnp_reserve_mem=mem1,size1[,mem2,size2] .... + + + +The Unified Plug and Play Layer +------------------------------- + +All Plug and Play drivers, protocols, and services meet at a central location +called the Plug and Play Layer. This layer is responsible for the exchange of +information between PnP drivers and PnP protocols. Thus it automatically +forwards commands to the proper protocol. This makes writing PnP drivers +significantly easier. + +The following functions are available from the Plug and Play Layer: + +pnp_get_protocol + increments the number of uses by one + +pnp_put_protocol + deincrements the number of uses by one + +pnp_register_protocol + use this to register a new PnP protocol + +pnp_unregister_protocol + use this function to remove a PnP protocol from the Plug and Play Layer + +pnp_register_driver + adds a PnP driver to the Plug and Play Layer + + this includes driver model integration + returns zero for success or a negative error number for failure; count + calls to the .add() method if you need to know how many devices bind to + the driver + +pnp_unregister_driver + removes a PnP driver from the Plug and Play Layer + + + +Plug and Play Protocols +----------------------- + +This section contains information for PnP protocol developers. + +The following Protocols are currently available in the computing world: + +- PNPBIOS: + used for system devices such as serial and parallel ports. +- ISAPNP: + provides PnP support for the ISA bus +- ACPI: + among its many uses, ACPI provides information about system level + devices. + +It is meant to replace the PNPBIOS. It is not currently supported by Linux +Plug and Play but it is planned to be in the near future. + + +Requirements for a Linux PnP protocol: +1. the protocol must use EISA IDs +2. the protocol must inform the PnP Layer of a device's current configuration + +- the ability to set resources is optional but preferred. + +The following are PnP protocol related functions: + +pnp_add_device + use this function to add a PnP device to the PnP layer + + only call this function when all wanted values are set in the pnp_dev + structure + +pnp_init_device + call this to initialize the PnP structure + +pnp_remove_device + call this to remove a device from the Plug and Play Layer. + it will fail if the device is still in use. + automatically will free mem used by the device and related structures + +pnp_add_id + adds an EISA ID to the list of supported IDs for the specified device + +For more information consult the source of a protocol such as +/drivers/pnp/pnpbios/core.c. + + + +Linux Plug and Play Drivers +--------------------------- + +This section contains information for Linux PnP driver developers. + +The New Way +^^^^^^^^^^^ + +1. first make a list of supported EISA IDS + + ex:: + + static const struct pnp_id pnp_dev_table[] = { + /* Standard LPT Printer Port */ + {.id = "PNP0400", .driver_data = 0}, + /* ECP Printer Port */ + {.id = "PNP0401", .driver_data = 0}, + {.id = ""} + }; + + Please note that the character 'X' can be used as a wild card in the function + portion (last four characters). + + ex:: + + /* Unknown PnP modems */ + { "PNPCXXX", UNKNOWN_DEV }, + + Supported PnP card IDs can optionally be defined. + ex:: + + static const struct pnp_id pnp_card_table[] = { + { "ANYDEVS", 0 }, + { "", 0 } + }; + +2. Optionally define probe and remove functions. It may make sense not to + define these functions if the driver already has a reliable method of detecting + the resources, such as the parport_pc driver. + + ex:: + + static int + serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const + struct pnp_id *dev_id) + { + . . . + + ex:: + + static void serial_pnp_remove(struct pnp_dev * dev) + { + . . . + + consult /drivers/serial/8250_pnp.c for more information. + +3. create a driver structure + + ex:: + + static struct pnp_driver serial_pnp_driver = { + .name = "serial", + .card_id_table = pnp_card_table, + .id_table = pnp_dev_table, + .probe = serial_pnp_probe, + .remove = serial_pnp_remove, + }; + + * name and id_table cannot be NULL. + +4. register the driver + + ex:: + + static int __init serial8250_pnp_init(void) + { + return pnp_register_driver(&serial_pnp_driver); + } + +The Old Way +^^^^^^^^^^^ + +A series of compatibility functions have been created to make it easy to convert +ISAPNP drivers. They should serve as a temporary solution only. + +They are as follows:: + + struct pnp_card *pnp_find_card(unsigned short vendor, + unsigned short device, + struct pnp_card *from) + + struct pnp_dev *pnp_find_dev(struct pnp_card *card, + unsigned short vendor, + unsigned short function, + struct pnp_dev *from) + diff --git a/Documentation/admin-guide/rtc.rst b/Documentation/admin-guide/rtc.rst new file mode 100644 index 000000000000..688c95b11919 --- /dev/null +++ b/Documentation/admin-guide/rtc.rst @@ -0,0 +1,140 @@ +======================================= +Real Time Clock (RTC) Drivers for Linux +======================================= + +When Linux developers talk about a "Real Time Clock", they usually mean +something that tracks wall clock time and is battery backed so that it +works even with system power off. Such clocks will normally not track +the local time zone or daylight savings time -- unless they dual boot +with MS-Windows -- but will instead be set to Coordinated Universal Time +(UTC, formerly "Greenwich Mean Time"). + +The newest non-PC hardware tends to just count seconds, like the time(2) +system call reports, but RTCs also very commonly represent time using +the Gregorian calendar and 24 hour time, as reported by gmtime(3). + +Linux has two largely-compatible userspace RTC API families you may +need to know about: + + * /dev/rtc ... is the RTC provided by PC compatible systems, + so it's not very portable to non-x86 systems. + + * /dev/rtc0, /dev/rtc1 ... are part of a framework that's + supported by a wide variety of RTC chips on all systems. + +Programmers need to understand that the PC/AT functionality is not +always available, and some systems can do much more. That is, the +RTCs use the same API to make requests in both RTC frameworks (using +different filenames of course), but the hardware may not offer the +same functionality. For example, not every RTC is hooked up to an +IRQ, so they can't all issue alarms; and where standard PC RTCs can +only issue an alarm up to 24 hours in the future, other hardware may +be able to schedule one any time in the upcoming century. + + +Old PC/AT-Compatible driver: /dev/rtc +-------------------------------------- + +All PCs (even Alpha machines) have a Real Time Clock built into them. +Usually they are built into the chipset of the computer, but some may +actually have a Motorola MC146818 (or clone) on the board. This is the +clock that keeps the date and time while your computer is turned off. + +ACPI has standardized that MC146818 functionality, and extended it in +a few ways (enabling longer alarm periods, and wake-from-hibernate). +That functionality is NOT exposed in the old driver. + +However it can also be used to generate signals from a slow 2Hz to a +relatively fast 8192Hz, in increments of powers of two. These signals +are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is +for...) It can also function as a 24hr alarm, raising IRQ 8 when the +alarm goes off. The alarm can also be programmed to only check any +subset of the three programmable values, meaning that it could be set to +ring on the 30th second of the 30th minute of every hour, for example. +The clock can also be set to generate an interrupt upon every clock +update, thus generating a 1Hz signal. + +The interrupts are reported via /dev/rtc (major 10, minor 135, read only +character device) in the form of an unsigned long. The low byte contains +the type of interrupt (update-done, alarm-rang, or periodic) that was +raised, and the remaining bytes contain the number of interrupts since +the last read. Status information is reported through the pseudo-file +/proc/driver/rtc if the /proc filesystem was enabled. The driver has +built in locking so that only one process is allowed to have the /dev/rtc +interface open at a time. + +A user process can monitor these interrupts by doing a read(2) or a +select(2) on /dev/rtc -- either will block/stop the user process until +the next interrupt is received. This is useful for things like +reasonably high frequency data acquisition where one doesn't want to +burn up 100% CPU by polling gettimeofday etc. etc. + +At high frequencies, or under high loads, the user process should check +the number of interrupts received since the last read to determine if +there has been any interrupt "pileup" so to speak. Just for reference, a +typical 486-33 running a tight read loop on /dev/rtc will start to suffer +occasional interrupt pileup (i.e. > 1 IRQ event since last read) for +frequencies above 1024Hz. So you really should check the high bytes +of the value you read, especially at frequencies above that of the +normal timer interrupt, which is 100Hz. + +Programming and/or enabling interrupt frequencies greater than 64Hz is +only allowed by root. This is perhaps a bit conservative, but we don't want +an evil user generating lots of IRQs on a slow 386sx-16, where it might have +a negative impact on performance. This 64Hz limit can be changed by writing +a different value to /proc/sys/dev/rtc/max-user-freq. Note that the +interrupt handler is only a few lines of code to minimize any possibility +of this effect. + +Also, if the kernel time is synchronized with an external source, the +kernel will write the time back to the CMOS clock every 11 minutes. In +the process of doing this, the kernel briefly turns off RTC periodic +interrupts, so be aware of this if you are doing serious work. If you +don't synchronize the kernel time with an external source (via ntp or +whatever) then the kernel will keep its hands off the RTC, allowing you +exclusive access to the device for your applications. + +The alarm and/or interrupt frequency are programmed into the RTC via +various ioctl(2) calls as listed in ./include/linux/rtc.h +Rather than write 50 pages describing the ioctl() and so on, it is +perhaps more useful to include a small test program that demonstrates +how to use them, and demonstrates the features of the driver. This is +probably a lot more useful to people interested in writing applications +that will be using this driver. See the code at the end of this document. + +(The original /dev/rtc driver was written by Paul Gortmaker.) + + +New portable "RTC Class" drivers: /dev/rtcN +-------------------------------------------- + +Because Linux supports many non-ACPI and non-PC platforms, some of which +have more than one RTC style clock, it needed a more portable solution +than expecting a single battery-backed MC146818 clone on every system. +Accordingly, a new "RTC Class" framework has been defined. It offers +three different userspace interfaces: + + * /dev/rtcN ... much the same as the older /dev/rtc interface + + * /sys/class/rtc/rtcN ... sysfs attributes support readonly + access to some RTC attributes. + + * /proc/driver/rtc ... the system clock RTC may expose itself + using a procfs interface. If there is no RTC for the system clock, + rtc0 is used by default. More information is (currently) shown + here than through sysfs. + +The RTC Class framework supports a wide variety of RTCs, ranging from those +integrated into embeddable system-on-chip (SOC) processors to discrete chips +using I2C, SPI, or some other bus to communicate with the host CPU. There's +even support for PC-style RTCs ... including the features exposed on newer PCs +through ACPI. + +The new framework also removes the "one RTC per system" restriction. For +example, maybe the low-power battery-backed RTC is a discrete I2C chip, but +a high functionality RTC is integrated into the SOC. That system might read +the system clock from the discrete RTC, but use the integrated one for all +other tasks, because of its greater functionality. + +Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the +ioctl interface. diff --git a/Documentation/admin-guide/svga.rst b/Documentation/admin-guide/svga.rst new file mode 100644 index 000000000000..b6c2f9acca92 --- /dev/null +++ b/Documentation/admin-guide/svga.rst @@ -0,0 +1,249 @@ +.. include:: + +================================= +Video Mode Selection Support 2.13 +================================= + +:Copyright: |copy| 1995--1999 Martin Mares, + +Intro +~~~~~ + +This small document describes the "Video Mode Selection" feature which +allows the use of various special video modes supported by the video BIOS. Due +to usage of the BIOS, the selection is limited to boot time (before the +kernel decompression starts) and works only on 80X86 machines. + +.. note:: + + Short intro for the impatient: Just use vga=ask for the first time, + enter ``scan`` on the video mode prompt, pick the mode you want to use, + remember its mode ID (the four-digit hexadecimal number) and then + set the vga parameter to this number (converted to decimal first). + +The video mode to be used is selected by a kernel parameter which can be +specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." +option of LILO (or some other boot loader you use) or by the "vidmode" utility +(present in standard Linux utility packages). You can use the following values +of this parameter:: + + NORMAL_VGA - Standard 80x25 mode available on all display adapters. + + EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. + + ASK_VGA - Display a video mode menu upon startup (see below). + + 0..35 - Menu item number (when you have used the menu to view the list of + modes available on your adapter, you can specify the menu item you want + to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the + mode list displayed may vary as the kernel version changes, because the + modes are listed in a "first detected -- first displayed" manner. It's + better to use absolute mode numbers instead. + + 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below + for exact meaning of the ID). Warning: rdev and LILO don't support + hexadecimal numbers -- you have to convert it to decimal manually. + +Menu +~~~~ + +The ASK_VGA mode causes the kernel to offer a video mode menu upon +bootup. It displays a "Press to see video modes available, +to continue or wait 30 secs" message. If you press , you enter the +menu, if you press or wait 30 seconds, the kernel will boot up in +the standard 80x25 mode. + +The menu looks like:: + + Video adapter: + Mode: COLSxROWS: + 0 0F00 80x25 + 1 0F01 80x50 + 2 0F02 80x43 + 3 0F03 80x26 + .... + Enter mode number or ``scan``: + + tells what video adapter did Linux detect +-- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA +with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection +of chipsets is turned off by default as it's inherently unreliable due to +absolutely insane PC design. + +"0 0F00 80x25" means that the first menu item (the menu items are numbered +from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the +next section for a description of mode IDs). + + encourages you to enter the item number or mode ID +you wish to set and press . If the computer complains something about +"Unknown mode ID", it is trying to tell you that it isn't possible to set such +a mode. It's also possible to press only which leaves the current mode. + +The mode list usually contains a few basic modes and some VESA modes. In +case your chipset has been detected, some chipset-specific modes are shown as +well (some of these might be missing or unusable on your machine as different +BIOSes are often shipped with the same card and the mode numbers depend purely +on the VGA BIOS). + +The modes displayed on the menu are partially sorted: The list starts with +the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and +80x43), local modes (if the local modes feature is enabled), VESA modes and +finally SVGA modes for the auto-detected adapter. + +If you are not happy with the mode list offered (e.g., if you think your card +is able to do more), you can enter "scan" instead of item number / mode ID. The +program will try to ask the BIOS for all possible video mode numbers and test +what happens then. The screen will be probably flashing wildly for some time and +strange noises will be heard from inside the monitor and so on and then, really +all consistent video modes supported by your BIOS will appear (plus maybe some +``ghost modes``). If you are afraid this could damage your monitor, don't use +this function. + +After scanning, the mode ordering is a bit different: the auto-detected SVGA +modes are not listed at all and the modes revealed by ``scan`` are shown before +all VESA modes. + +Mode IDs +~~~~~~~~ + +Because of the complexity of all the video stuff, the video mode IDs +used here are also a bit complex. A video mode ID is a 16-bit number usually +expressed in a hexadecimal notation (starting with "0x"). You can set a mode +by entering its mode directly if you know it even if it isn't shown on the menu. + +The ID numbers can be divided to those regions:: + + 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use + outside the menu as this can change from boot to boot (especially if you + have used the ``scan`` feature). + + 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number + (as presented to INT 10, function 00) increased by 0x0100. + + 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by + 0x0100. All VESA modes should be autodetected and shown on the menu. + + 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. + (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, + 945=132x28 for the standard Video7 BIOS) + + 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually + by modifying one of the standard modes). Currently available: + 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) + 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA + 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) + 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) + 0x0f04 leave current video mode + 0x0f05 VGA 80x30 (480 scans, 16-point font) + 0x0f06 VGA 80x34 (480 scans, 14-point font) + 0x0f07 VGA 80x60 (480 scans, 8-point font) + 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) + + 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" + form where RR is a number of rows and CC is a number of columns. + E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. + This is the only fully portable way to refer to a non-standard mode, + but it relies on the mode being found and displayed on the menu + (remember that mode scanning is not done automatically). + + 0xff00 to 0xffff - aliases for backward compatibility: + 0xffff equivalent to 0x0f00 (standard 80x25) + 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) + +If you add 0x8000 to the mode ID, the program will try to recalculate +vertical display timing according to mode parameters, which can be used to +eliminate some annoying bugs of certain VGA BIOSes (usually those used for +cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the +end of the display. + +Options +~~~~~~~ + +Build options for arch/x86/boot/* are selected by the kernel kconfig +utility and the kernel .config file. + +VIDEO_GFX_HACK - includes special hack for setting of graphics modes +to be used later by special drivers. +Allows to set _any_ BIOS mode including graphic ones and forcing specific +text screen resolution instead of peeking it from BIOS variables. Don't use +unless you think you know what you're doing. To activate this setup, use +mode number 0x0f08 (see the Mode IDs section above). + +Still doesn't work? +~~~~~~~~~~~~~~~~~~~ + +When the mode detection doesn't work (e.g., the mode list is incorrect or +the machine hangs instead of displaying the menu), try to switch off some of +the configuration options listed under "Options". If it fails, you can still use +your kernel with the video mode set directly via the kernel parameter. + +In either case, please send me a bug report containing what _exactly_ +happens and how do the configuration switches affect the behaviour of the bug. + +If you start Linux from M$-DOS, you might also use some DOS tools for +video mode setting. In this case, you must specify the 0x0f04 mode ("leave +current settings") to Linux, because if you don't and you use any non-standard +mode, Linux will switch to 80x25 automatically. + +If you set some extended mode and there's one or more extra lines on the +bottom of the display containing already scrolled-out text, your VGA BIOS +contains the most common video BIOS bug called "incorrect vertical display +end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, +this must be done manually -- no autodetection mechanisms are available. + +History +~~~~~~~ + +=============== ================================================================ +1.0 (??-Nov-95) First version supporting all adapters supported by the old + setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels + and then removed due to instability on some machines. +2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost + everything is configurable, the VESA support should be much more + stable, explicit mode numbering allowed, "scan" implemented etc. +2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution + supported. Few bugs fixed. VESA modes are listed prior to + modes supplied by SVGA autodetection as they are more reliable. + CLGD autodetect works better. Doesn't depend on 80x25 being + active when started. Scanning fixed. 80x43 (any VGA) added. + Code cleaned up. +2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX + VESA modes work now). Display end bug workaround supported. + Special modes renumbered to allow adding of the "recalculate" + flag, 0xffff and 0xfffe became aliases instead of real IDs. + Screen contents retained during mode changes. +2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. +2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem + with some boot loaders. Memory management rewritten to reflect + these changes. Unfortunately, screen contents retaining works + only with some loaders now. + Added a Tseng 132x60 mode. +2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. +2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on + several cards with broken VESA code (e.g., ATI VGA). +2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some + cards use very strange mode numbers. + - Added Realtek VGA modes (thanks to Gonzalo Tornaria). + - Hardware testing order slightly changed, tests based on ROM + contents done as first. + - Added support for special Video7 mode switching functions + (thanks to Tom Vander Aa). + - Added 480-scanline modes (especially useful for notebooks, + original version written by hhanemaa@cs.ruu.nl, patched by + Jeff Chua, rewritten by me). + - Screen store/restore fixed. +2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. + - Better recognition of text modes during mode scan. +2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) +2.10(11-Nov-96) - The whole thing made optional. + - Added the CONFIG_VIDEO_400_HACK switch. + - Added the CONFIG_VIDEO_GFX_HACK switch. + - Code cleanup. +2.11(03-May-97) - Yet another cleanup, now including also the documentation. + - Direct testing of SVGA adapters turned off by default, ``scan`` + offered explicitly on the prompt line. + - Removed the doc section describing adding of new probing + functions as I try to get rid of _all_ hardware probing here. +2.12(25-May-98) Added support for VESA frame buffer graphics. +2.13(14-May-99) Minor documentation fixes. +=============== ================================================================ diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a0c1d4ce403a..032c7cd3cede 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -327,7 +327,7 @@ when a hard lockup is detected. 0 - don't panic on hard lockup 1 - panic on hard lockup -See Documentation/lockup-watchdogs.txt for more information. This can +See Documentation/admin-guide/lockup-watchdogs.rst for more information. This can also be set using the nmi_watchdog kernel parameter. diff --git a/Documentation/admin-guide/video-output.rst b/Documentation/admin-guide/video-output.rst new file mode 100644 index 000000000000..56d6fa2e2368 --- /dev/null +++ b/Documentation/admin-guide/video-output.rst @@ -0,0 +1,34 @@ +Video Output Switcher Control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +2006 luming.yu@intel.com + +The output sysfs class driver provides an abstract video output layer that +can be used to hook platform specific methods to enable/disable video output +device through common sysfs interface. For example, on my IBM ThinkPad T42 +laptop, The ACPI video driver registered its output devices and read/write +method for 'state' with output sysfs class. The user interface under sysfs is:: + + linux:/sys/class/video_output # tree . + . + |-- CRT0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- DVI0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + |-- LCD0 + | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + | |-- state + | |-- subsystem -> ../../../class/video_output + | `-- uevent + `-- TV0 + |-- device -> ../../../devices/pci0000:00/0000:00:01.0 + |-- state + |-- subsystem -> ../../../class/video_output + `-- uevent + diff --git a/Documentation/auxdisplay/lcd-panel-cgram.rst b/Documentation/auxdisplay/lcd-panel-cgram.rst deleted file mode 100644 index dfef50286018..000000000000 --- a/Documentation/auxdisplay/lcd-panel-cgram.rst +++ /dev/null @@ -1,29 +0,0 @@ -:orphan: - -====================================== -Parallel port LCD/Keypad Panel support -====================================== - -Some LCDs allow you to define up to 8 characters, mapped to ASCII -characters 0 to 7. The escape code to define a new character is -'\e[LG' followed by one digit from 0 to 7, representing the character -number, and up to 8 couples of hex digits terminated by a semi-colon -(';'). Each couple of digits represents a line, with 1-bits for each -illuminated pixel with LSB on the right. Lines are numbered from the -top of the character to the bottom. On a 5x7 matrix, only the 5 lower -bits of the 7 first bytes are used for each character. If the string -is incomplete, only complete lines will be redefined. Here are some -examples:: - - printf "\e[LG0010101050D1F0C04;" => 0 = [enter] - printf "\e[LG1040E1F0000000000;" => 1 = [up] - printf "\e[LG2000000001F0E0400;" => 2 = [down] - printf "\e[LG3040E1F001F0E0400;" => 3 = [up-down] - printf "\e[LG40002060E1E0E0602;" => 4 = [left] - printf "\e[LG500080C0E0F0E0C08;" => 5 = [right] - printf "\e[LG60016051516141400;" => 6 = "IP" - - printf "\e[LG00103071F1F070301;" => big speaker - printf "\e[LG00002061E1E060200;" => small speaker - -Willy diff --git a/Documentation/btmrvl.txt b/Documentation/btmrvl.txt deleted file mode 100644 index ec57740ead0c..000000000000 --- a/Documentation/btmrvl.txt +++ /dev/null @@ -1,124 +0,0 @@ -============= -btmrvl driver -============= - -All commands are used via debugfs interface. - -Set/get driver configurations -============================= - -Path: /debug/btmrvl/config/ - -gpiogap=[n], hscfgcmd - These commands are used to configure the host sleep parameters:: - bit 8:0 -- Gap - bit 16:8 -- GPIO - - where GPIO is the pin number of GPIO used to wake up the host. - It could be any valid GPIO pin# (e.g. 0-7) or 0xff (SDIO interface - wakeup will be used instead). - - where Gap is the gap in milli seconds between wakeup signal and - wakeup event, or 0xff for special host sleep setting. - - Usage:: - - # Use SDIO interface to wake up the host and set GAP to 0x80: - echo 0xff80 > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - - # Use GPIO pin #3 to wake up the host and set GAP to 0xff: - echo 0x03ff > /debug/btmrvl/config/gpiogap - echo 1 > /debug/btmrvl/config/hscfgcmd - -psmode=[n], pscmd - These commands are used to enable/disable auto sleep mode - - where the option is:: - - 1 -- Enable auto sleep mode - 0 -- Disable auto sleep mode - - Usage:: - - # Enable auto sleep mode - echo 1 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - # Disable auto sleep mode - echo 0 > /debug/btmrvl/config/psmode - echo 1 > /debug/btmrvl/config/pscmd - - -hsmode=[n], hscmd - These commands are used to enable host sleep or wake up firmware - - where the option is:: - - 1 -- Enable host sleep - 0 -- Wake up firmware - - Usage:: - - # Enable host sleep - echo 1 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - # Wake up firmware - echo 0 > /debug/btmrvl/config/hsmode - echo 1 > /debug/btmrvl/config/hscmd - - -Get driver status -================= - -Path: /debug/btmrvl/status/ - -Usage:: - - cat /debug/btmrvl/status/ - -where the args are: - -curpsmode - This command displays current auto sleep status. - -psstate - This command display the power save state. - -hsstate - This command display the host sleep state. - -txdnldrdy - This command displays the value of Tx download ready flag. - -Issuing a raw hci command -========================= - -Use hcitool to issue raw hci command, refer to hcitool manual - -Usage:: - - Hcitool cmd [Parameters] - -Interface Control Command:: - - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00 --Enable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01 --Enable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02 --Enable BT interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x00 --Disable All interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01 --Disable Wlan interface - hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02 --Disable BT interface - -SD8688 firmware -=============== - -Images: - -- /lib/firmware/sd8688_helper.bin -- /lib/firmware/sd8688.bin - - -The images can be downloaded from: - -git.infradead.org/users/dwmw2/linux-firmware.git/libertas/ diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt deleted file mode 100644 index 211fd926cf00..000000000000 --- a/Documentation/clearing-warn-once.txt +++ /dev/null @@ -1,9 +0,0 @@ -Clearing WARN_ONCE ------------------- - -WARN_ONCE / WARN_ON_ONCE / printk_once only emit a message once. - -echo 1 > /sys/kernel/debug/clear_warn_once - -clears the state and allows the warnings to print once again. -This can be useful after test suite runs to reproduce problems. diff --git a/Documentation/cma/debugfs.rst b/Documentation/cma/debugfs.rst deleted file mode 100644 index 518fe401b5ee..000000000000 --- a/Documentation/cma/debugfs.rst +++ /dev/null @@ -1,27 +0,0 @@ -:orphan: - -===================== -CMA Debugfs Interface -===================== - -The CMA debugfs interface is useful to retrieve basic information out of the -different CMA areas and to test allocation/release in each of the areas. - -Each CMA zone represents a directory under /cma/, indexed by the -kernel's CMA index. So the first CMA zone would be: - - /cma/cma-0 - -The structure of the files created under that directory is as follows: - - - [RO] base_pfn: The base PFN (Page Frame Number) of the zone. - - [RO] count: Amount of memory in the CMA area. - - [RO] order_per_bit: Order of pages represented by one bit. - - [RO] bitmap: The bitmap of page states in the zone. - - [WO] alloc: Allocate N pages from that CMA area. For example:: - - echo 5 > /cma/cma-2/alloc - -would try to allocate 5 pages from the cma-2 area. - - - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt deleted file mode 100644 index 2d01ce43d2a2..000000000000 --- a/Documentation/cpu-load.txt +++ /dev/null @@ -1,114 +0,0 @@ -======== -CPU load -======== - -Linux exports various bits of information via ``/proc/stat`` and -``/proc/uptime`` that userland tools, such as top(1), use to calculate -the average time system spent in a particular state, for example:: - - $ iostat - Linux 2.6.18.3-exp (linmac) 02/20/2007 - - avg-cpu: %user %nice %system %iowait %steal %idle - 10.01 0.00 2.92 5.44 0.00 81.63 - - ... - -Here the system thinks that over the default sampling period the -system spent 10.01% of the time doing work in user space, 2.92% in the -kernel, and was overall 81.63% of the time idle. - -In most cases the ``/proc/stat`` information reflects the reality quite -closely, however due to the nature of how/when the kernel collects -this data sometimes it can not be trusted at all. - -So how is this information collected? Whenever timer interrupt is -signalled the kernel looks what kind of task was running at this -moment and increments the counter that corresponds to this tasks -kind/state. The problem with this is that the system could have -switched between various states multiple times between two timer -interrupts yet the counter is incremented only for the last state. - - -Example -------- - -If we imagine the system with one task that periodically burns cycles -in the following manner:: - - time line between two timer interrupts - |--------------------------------------| - ^ ^ - |_ something begins working | - |_ something goes to sleep - (only to be awaken quite soon) - -In the above situation the system will be 0% loaded according to the -``/proc/stat`` (since the timer interrupt will always happen when the -system is executing the idle handler), but in reality the load is -closer to 99%. - -One can imagine many more situations where this behavior of the kernel -will lead to quite erratic information inside ``/proc/stat``:: - - - /* gcc -o hog smallhog.c */ - #include - #include - #include - #include - #define HIST 10 - - static volatile sig_atomic_t stop; - - static void sighandler (int signr) - { - (void) signr; - stop = 1; - } - static unsigned long hog (unsigned long niters) - { - stop = 0; - while (!stop && --niters); - return niters; - } - int main (void) - { - int i; - struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 }, - .it_value = { .tv_sec = 0, .tv_usec = 1 } }; - sigset_t set; - unsigned long v[HIST]; - double tmp = 0.0; - unsigned long n; - signal (SIGALRM, &sighandler); - setitimer (ITIMER_REAL, &it, NULL); - - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX); - for (i = 0; i < HIST; ++i) tmp += v[i]; - tmp /= HIST; - n = tmp - (tmp / 3.0); - - sigemptyset (&set); - sigaddset (&set, SIGALRM); - - for (;;) { - hog (n); - sigwait (&set, &i); - } - return 0; - } - - -References ----------- - -- http://lkml.org/lkml/2007/2/12/6 -- Documentation/filesystems/proc.txt (1.8) - - -Thanks ------- - -Con Kolivas, Pavel Machek diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt deleted file mode 100644 index b90dafcc8237..000000000000 --- a/Documentation/cputopology.txt +++ /dev/null @@ -1,177 +0,0 @@ -=========================================== -How CPU topology info is exported via sysfs -=========================================== - -Export CPU topology info via sysfs. Items (attributes) are similar -to /proc/cpuinfo output of some architectures. They reside in -/sys/devices/system/cpu/cpuX/topology/: - -physical_package_id: - - physical package id of cpuX. Typically corresponds to a physical - socket number, but the actual value is architecture and platform - dependent. - -die_id: - - the CPU die ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_id: - - the CPU core ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -book_id: - - the book ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -drawer_id: - - the drawer ID of cpuX. Typically it is the hardware platform's - identifier (rather than the kernel's). The actual value is - architecture and platform dependent. - -core_cpus: - - internal kernel map of CPUs within the same core. - (deprecated name: "thread_siblings") - -core_cpus_list: - - human-readable list of CPUs within the same core. - (deprecated name: "thread_siblings_list"); - -package_cpus: - - internal kernel map of the CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings") - -package_cpus_list: - - human-readable list of CPUs sharing the same physical_package_id. - (deprecated name: "core_siblings_list") - -die_cpus: - - internal kernel map of CPUs within the same die. - -die_cpus_list: - - human-readable list of CPUs within the same die. - -book_siblings: - - internal kernel map of cpuX's hardware threads within the same - book_id. - -book_siblings_list: - - human-readable list of cpuX's hardware threads within the same - book_id. - -drawer_siblings: - - internal kernel map of cpuX's hardware threads within the same - drawer_id. - -drawer_siblings_list: - - human-readable list of cpuX's hardware threads within the same - drawer_id. - -Architecture-neutral, drivers/base/topology.c, exports these attributes. -However, the book and drawer related sysfs files will only be created if -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are selected, respectively. - -CONFIG_SCHED_BOOK and CONFIG_SCHED_DRAWER are currently only used on s390, -where they reflect the cpu and cache hierarchy. - -For an architecture to support this feature, it must define some of -these macros in include/asm-XXX/topology.h:: - - #define topology_physical_package_id(cpu) - #define topology_die_id(cpu) - #define topology_core_id(cpu) - #define topology_book_id(cpu) - #define topology_drawer_id(cpu) - #define topology_sibling_cpumask(cpu) - #define topology_core_cpumask(cpu) - #define topology_die_cpumask(cpu) - #define topology_book_cpumask(cpu) - #define topology_drawer_cpumask(cpu) - -The type of ``**_id macros`` is int. -The type of ``**_cpumask macros`` is ``(const) struct cpumask *``. The latter -correspond with appropriate ``**_siblings`` sysfs attributes (except for -topology_sibling_cpumask() which corresponds with thread_siblings). - -To be consistent on all architectures, include/linux/topology.h -provides default definitions for any of the above macros that are -not defined by include/asm-XXX/topology.h: - -1) topology_physical_package_id: -1 -2) topology_die_id: -1 -3) topology_core_id: 0 -4) topology_sibling_cpumask: just the given CPU -5) topology_core_cpumask: just the given CPU -6) topology_die_cpumask: just the given CPU - -For architectures that don't support books (CONFIG_SCHED_BOOK) there are no -default definitions for topology_book_id() and topology_book_cpumask(). -For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are -no default definitions for topology_drawer_id() and topology_drawer_cpumask(). - -Additionally, CPU topology information is provided under -/sys/devices/system/cpu and includes these files. The internal -source for the output is in brackets ("[]"). - - =========== ========================================================== - kernel_max: the maximum CPU index allowed by the kernel configuration. - [NR_CPUS-1] - - offline: CPUs that are not online because they have been - HOTPLUGGED off (see cpu-hotplug.txt) or exceed the limit - of CPUs allowed by the kernel configuration (kernel_max - above). [~cpu_online_mask + cpus >= NR_CPUS] - - online: CPUs that are online and being scheduled [cpu_online_mask] - - possible: CPUs that have been allocated resources and can be - brought online if they are present. [cpu_possible_mask] - - present: CPUs that have been identified as being present in the - system. [cpu_present_mask] - =========== ========================================================== - -The format for the above output is compatible with cpulist_parse() -[see ]. Some examples follow. - -In this example, there are 64 CPUs in the system but cpus 32-63 exceed -the kernel max which is limited to 0..31 by the NR_CPUS config option -being 32. Note also that CPUs 2 and 4-31 are not online but could be -brought online as they are both present and possible:: - - kernel_max: 31 - offline: 2,4-31,32-63 - online: 0-1,3 - possible: 0-31 - present: 0-31 - -In this example, the NR_CPUS config option is 128, but the kernel was -started with possible_cpus=144. There are 4 CPUs in the system and cpu2 -was manually taken offline (and is the only CPU that can be brought -online.):: - - kernel_max: 127 - offline: 2,4-127,128-143 - online: 0-1,3 - possible: 0-127 - present: 0-3 - -See cpu-hotplug.txt for the possible_cpus=NUM kernel start parameter -as well as more information on the various cpumasks. diff --git a/Documentation/efi-stub.txt b/Documentation/efi-stub.txt deleted file mode 100644 index 833edb0d0bc4..000000000000 --- a/Documentation/efi-stub.txt +++ /dev/null @@ -1,100 +0,0 @@ -================= -The EFI Boot Stub -================= - -On the x86 and ARM platforms, a kernel zImage/bzImage can masquerade -as a PE/COFF image, thereby convincing EFI firmware loaders to load -it as an EFI executable. The code that modifies the bzImage header, -along with the EFI-specific entry point that the firmware loader -jumps to are collectively known as the "EFI boot stub", and live in -arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, -respectively. For ARM the EFI stub is implemented in -arch/arm/boot/compressed/efi-header.S and -arch/arm/boot/compressed/efi-stub.c. EFI stub code that is shared -between architectures is in drivers/firmware/efi/libstub. - -For arm64, there is no compressed kernel support, so the Image itself -masquerades as a PE/COFF image and the EFI stub is linked into the -kernel. The arm64 EFI stub lives in arch/arm64/kernel/efi-entry.S -and drivers/firmware/efi/libstub/arm64-stub.c. - -By using the EFI boot stub it's possible to boot a Linux kernel -without the use of a conventional EFI boot loader, such as grub or -elilo. Since the EFI boot stub performs the jobs of a boot loader, in -a certain sense it *IS* the boot loader. - -The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. - - -How to install bzImage.efi --------------------------- - -The bzImage located in arch/x86/boot/bzImage must be copied to the EFI -System Partition (ESP) and renamed with the extension ".efi". Without -the extension the EFI firmware loader will refuse to execute it. It's -not possible to execute bzImage.efi from the usual Linux file systems -because EFI firmware doesn't have support for them. For ARM the -arch/arm/boot/zImage should be copied to the system partition, and it -may not need to be renamed. Similarly for arm64, arch/arm64/boot/Image -should be copied but not necessarily renamed. - - -Passing kernel parameters from the EFI shell --------------------------------------------- - -Arguments to the kernel can be passed after bzImage.efi, e.g.:: - - fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 - - -The "initrd=" option --------------------- - -Like most boot loaders, the EFI stub allows the user to specify -multiple initrd files using the "initrd=" option. This is the only EFI -stub-specific command line parameter, everything else is passed to the -kernel when it boots. - -The path to the initrd file must be an absolute path from the -beginning of the ESP, relative path names do not work. Also, the path -is an EFI-style path and directory elements must be separated with -backslashes (\). For example, given the following directory layout:: - - fs0:> - Kernels\ - bzImage.efi - initrd-large.img - - Ramdisks\ - initrd-small.img - initrd-medium.img - -to boot with the initrd-large.img file if the current working -directory is fs0:\Kernels, the following command must be used:: - - fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img - -Notice how bzImage.efi can be specified with a relative path. That's -because the image we're executing is interpreted by the EFI shell, -which understands relative paths, whereas the rest of the command line -is passed to bzImage.efi. - - -The "dtb=" option ------------------ - -For the ARM and arm64 architectures, a device tree must be provided to -the kernel. Normally firmware shall supply the device tree via the -EFI CONFIGURATION TABLE. However, the "dtb=" command line option can -be used to override the firmware supplied device tree, or to supply -one when firmware is unable to. - -Please note: Firmware adds runtime configuration information to the -device tree before booting the kernel. If dtb= is used to override -the device tree, then any runtime data provided by firmware will be -lost. The dtb= option should only be used either as a debug tool, or -as a last resort when a device tree is not provided in the EFI -CONFIGURATION TABLE. - -"dtb=" is processed in the same manner as the "initrd=" option that is -described above. diff --git a/Documentation/fb/vesafb.rst b/Documentation/fb/vesafb.rst index 2ed0dfb661cf..6821c87b7893 100644 --- a/Documentation/fb/vesafb.rst +++ b/Documentation/fb/vesafb.rst @@ -30,7 +30,7 @@ How to use it? ============== Switching modes is done using the vga=... boot parameter. Read -Documentation/svga.txt for details. +Documentation/admin-guide/svga.rst for details. You should compile in both vgacon (for text mode) and vesafb (for graphics mode). Which of them takes over the console depends on diff --git a/Documentation/highuid.txt b/Documentation/highuid.txt deleted file mode 100644 index 6ee70465c0ea..000000000000 --- a/Documentation/highuid.txt +++ /dev/null @@ -1,80 +0,0 @@ -=================================================== -Notes on the change from 16-bit UIDs to 32-bit UIDs -=================================================== - -:Author: Chris Wing -:Last updated: January 11, 2000 - -- kernel code MUST take into account __kernel_uid_t and __kernel_uid32_t - when communicating between user and kernel space in an ioctl or data - structure. - -- kernel code should use uid_t and gid_t in kernel-private structures and - code. - -What's left to be done for 32-bit UIDs on all Linux architectures: - -- Disk quotas have an interesting limitation that is not related to the - maximum UID/GID. They are limited by the maximum file size on the - underlying filesystem, because quota records are written at offsets - corresponding to the UID in question. - Further investigation is needed to see if the quota system can cope - properly with huge UIDs. If it can deal with 64-bit file offsets on all - architectures, this should not be a problem. - -- Decide whether or not to keep backwards compatibility with the system - accounting file, or if we should break it as the comments suggest - (currently, the old 16-bit UID and GID are still written to disk, and - part of the former pad space is used to store separate 32-bit UID and - GID) - -- Need to validate that OS emulation calls the 16-bit UID - compatibility syscalls, if the OS being emulated used 16-bit UIDs, or - uses the 32-bit UID system calls properly otherwise. - - This affects at least: - - - iBCS on Intel - - - sparc32 emulation on sparc64 - (need to support whatever new 32-bit UID system calls are added to - sparc32) - -- Validate that all filesystems behave properly. - - At present, 32-bit UIDs _should_ work for: - - - ext2 - - ufs - - isofs - - nfs - - coda - - udf - - Ioctl() fixups have been made for: - - - ncpfs - - smbfs - - Filesystems with simple fixups to prevent 16-bit UID wraparound: - - - minix - - sysv - - qnx4 - - Other filesystems have not been checked yet. - -- The ncpfs and smpfs filesystems cannot presently use 32-bit UIDs in - all ioctl()s. Some new ioctl()s have been added with 32-bit UIDs, but - more are needed. (as well as new user<->kernel data structures) - -- The ELF core dump format only supports 16-bit UIDs on arm, i386, m68k, - sh, and sparc32. Fixing this is probably not that important, but would - require adding a new ELF section. - -- The ioctl()s used to control the in-kernel NFS server only support - 16-bit UIDs on arm, i386, m68k, sh, and sparc32. - -- make sure that the UID mapping feature of AX25 networking works properly - (it should be safe because it's always used a 32-bit integer to - communicate between user and kernel) diff --git a/Documentation/hw_random.txt b/Documentation/hw_random.txt deleted file mode 100644 index 121de96e395e..000000000000 --- a/Documentation/hw_random.txt +++ /dev/null @@ -1,105 +0,0 @@ -========================================================== -Linux support for random number generator in i8xx chipsets -========================================================== - -Introduction -============ - -The hw_random framework is software that makes use of a -special hardware feature on your CPU or motherboard, -a Random Number Generator (RNG). The software has two parts: -a core providing the /dev/hwrng character device and its -sysfs support, plus a hardware-specific driver that plugs -into that core. - -To make the most effective use of these mechanisms, you -should download the support software as well. Download the -latest version of the "rng-tools" package from the -hw_random driver's official Web site: - - http://sourceforge.net/projects/gkernel/ - -Those tools use /dev/hwrng to fill the kernel entropy pool, -which is used internally and exported by the /dev/urandom and -/dev/random special files. - -Theory of operation -=================== - -CHARACTER DEVICE. Using the standard open() -and read() system calls, you can read random data from -the hardware RNG device. This data is NOT CHECKED by any -fitness tests, and could potentially be bogus (if the -hardware is faulty or has been tampered with). Data is only -output if the hardware "has-data" flag is set, but nevertheless -a security-conscious person would run fitness tests on the -data before assuming it is truly random. - -The rng-tools package uses such tests in "rngd", and lets you -run them by hand with a "rngtest" utility. - -/dev/hwrng is char device major 10, minor 183. - -CLASS DEVICE. There is a /sys/class/misc/hw_random node with -two unique attributes, "rng_available" and "rng_current". The -"rng_available" attribute lists the hardware-specific drivers -available, while "rng_current" lists the one which is currently -connected to /dev/hwrng. If your system has more than one -RNG available, you may change the one used by writing a name from -the list in "rng_available" into "rng_current". - -========================================================================== - - -Hardware driver for Intel/AMD/VIA Random Number Generators (RNG) - - Copyright 2000,2001 Jeff Garzik - - Copyright 2000,2001 Philipp Rumpf - - -About the Intel RNG hardware, from the firmware hub datasheet -============================================================= - -The Firmware Hub integrates a Random Number Generator (RNG) -using thermal noise generated from inherently random quantum -mechanical properties of silicon. When not generating new random -bits the RNG circuitry will enter a low power state. Intel will -provide a binary software driver to give third party software -access to our RNG for use as a security feature. At this time, -the RNG is only to be used with a system in an OS-present state. - -Intel RNG Driver notes -====================== - -FIXME: support poll(2) - -.. note:: - - request_mem_region was removed, for three reasons: - - 1) Only one RNG is supported by this driver; - 2) The location used by the RNG is a fixed location in - MMIO-addressable memory; - 3) users with properly working BIOS e820 handling will always - have the region in which the RNG is located reserved, so - request_mem_region calls always fail for proper setups. - However, for people who use mem=XX, BIOS e820 information is - **not** in /proc/iomem, and request_mem_region(RNG_ADDR) can - succeed. - -Driver details -============== - -Based on: - Intel 82802AB/82802AC Firmware Hub (FWH) Datasheet - May 1999 Order Number: 290658-002 R - -Intel 82802 Firmware Hub: - Random Number Generator - Programmer's Reference Manual - December 1999 Order Number: 298029-001 R - -Intel 82802 Firmware HUB Random Number Generator Driver - Copyright (c) 2000 Matt Sottek - -Special thanks to Matt Sottek. I did the "guts", he -did the "brains" and all the testing. diff --git a/Documentation/iostats.txt b/Documentation/iostats.txt deleted file mode 100644 index 5d63b18bd6d1..000000000000 --- a/Documentation/iostats.txt +++ /dev/null @@ -1,197 +0,0 @@ -===================== -I/O statistics fields -===================== - -Since 2.4.20 (and some versions before, with patches), and 2.5.45, -more extensive disk statistics have been introduced to help measure disk -activity. Tools such as ``sar`` and ``iostat`` typically interpret these and do -the work for you, but in case you are interested in creating your own -tools, the fields are explained here. - -In 2.4 now, the information is found as additional fields in -``/proc/partitions``. In 2.6 and upper, the same information is found in two -places: one is in the file ``/proc/diskstats``, and the other is within -the sysfs file system, which must be mounted in order to obtain -the information. Throughout this document we'll assume that sysfs -is mounted on ``/sys``, although of course it may be mounted anywhere. -Both ``/proc/diskstats`` and sysfs use the same source for the information -and so should not differ. - -Here are examples of these different formats:: - - 2.4: - 3 0 39082680 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 9221278 hda1 35486 0 35496 38030 0 0 0 0 0 38030 38030 - - 2.6+ sysfs: - 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 35486 38030 38030 38030 - - 2.6+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 - 3 1 hda1 35486 38030 38030 38030 - - 4.18+ diskstats: - 3 0 hda 446216 784926 9550688 4382310 424847 312726 5922052 19310380 0 3376340 23705160 0 0 0 0 - -On 2.4 you might execute ``grep 'hda ' /proc/partitions``. On 2.6+, you have -a choice of ``cat /sys/block/hda/stat`` or ``grep 'hda ' /proc/diskstats``. - -The advantage of one over the other is that the sysfs choice works well -if you are watching a known, small set of disks. ``/proc/diskstats`` may -be a better choice if you are watching a large number of disks because -you'll avoid the overhead of 50, 100, or 500 or more opens/closes with -each snapshot of your disk statistics. - -In 2.4, the statistics fields are those after the device name. In -the above example, the first field of statistics would be 446216. -By contrast, in 2.6+ if you look at ``/sys/block/hda/stat``, you'll -find just the eleven fields, beginning with 446216. If you look at -``/proc/diskstats``, the eleven fields will be preceded by the major and -minor device numbers, and device name. Each of these formats provides -eleven fields of statistics, each meaning exactly the same things. -All fields except field 9 are cumulative since boot. Field 9 should -go to zero as I/Os complete; all others only increase (unless they -overflow and wrap). Yes, these are (32-bit or 64-bit) unsigned long -(native word size) numbers, and on a very busy or long-lived system they -may wrap. Applications should be prepared to deal with that; unless -your observations are measured in large numbers of minutes or hours, -they should not wrap twice before you notice them. - -Each set of stats only applies to the indicated device; if you want -system-wide stats you'll have to find all the devices and sum them all up. - -Field 1 -- # of reads completed - This is the total number of reads completed successfully. - -Field 2 -- # of reads merged, field 6 -- # of writes merged - Reads and writes which are adjacent to each other may be merged for - efficiency. Thus two 4K reads may become one 8K read before it is - ultimately handed to the disk, and so it will be counted (and queued) - as only one I/O. This field lets you know how often this was done. - -Field 3 -- # of sectors read - This is the total number of sectors read successfully. - -Field 4 -- # of milliseconds spent reading - This is the total number of milliseconds spent by all reads (as - measured from __make_request() to end_that_request_last()). - -Field 5 -- # of writes completed - This is the total number of writes completed successfully. - -Field 6 -- # of writes merged - See the description of field 2. - -Field 7 -- # of sectors written - This is the total number of sectors written successfully. - -Field 8 -- # of milliseconds spent writing - This is the total number of milliseconds spent by all writes (as - measured from __make_request() to end_that_request_last()). - -Field 9 -- # of I/Os currently in progress - The only field that should go to zero. Incremented as requests are - given to appropriate struct request_queue and decremented as they finish. - -Field 10 -- # of milliseconds spent doing I/Os - This field increases so long as field 9 is nonzero. - - Since 5.0 this field counts jiffies when at least one request was - started or completed. If request runs more than 2 jiffies then some - I/O time will not be accounted unless there are other requests. - -Field 11 -- weighted # of milliseconds spent doing I/Os - This field is incremented at each I/O start, I/O completion, I/O - merge, or read of these stats by the number of I/Os in progress - (field 9) times the number of milliseconds spent doing I/O since the - last update of this field. This can provide an easy measure of both - I/O completion time and the backlog that may be accumulating. - -Field 12 -- # of discards completed - This is the total number of discards completed successfully. - -Field 13 -- # of discards merged - See the description of field 2 - -Field 14 -- # of sectors discarded - This is the total number of sectors discarded successfully. - -Field 15 -- # of milliseconds spent discarding - This is the total number of milliseconds spent by all discards (as - measured from __make_request() to end_that_request_last()). - -To avoid introducing performance bottlenecks, no locks are held while -modifying these counters. This implies that minor inaccuracies may be -introduced when changes collide, so (for instance) adding up all the -read I/Os issued per partition should equal those made to the disks ... -but due to the lack of locking it may only be very close. - -In 2.6+, there are counters for each CPU, which make the lack of locking -almost a non-issue. When the statistics are read, the per-CPU counters -are summed (possibly overflowing the unsigned long variable they are -summed to) and the result given to the user. There is no convenient -user interface for accessing the per-CPU counters themselves. - -Disks vs Partitions -------------------- - -There were significant changes between 2.4 and 2.6+ in the I/O subsystem. -As a result, some statistic information disappeared. The translation from -a disk address relative to a partition to the disk address relative to -the host disk happens much earlier. All merges and timings now happen -at the disk level rather than at both the disk and partition level as -in 2.4. Consequently, you'll see a different statistics output on 2.6+ for -partitions from that for disks. There are only *four* fields available -for partitions on 2.6+ machines. This is reflected in the examples above. - -Field 1 -- # of reads issued - This is the total number of reads issued to this partition. - -Field 2 -- # of sectors read - This is the total number of sectors requested to be read from this - partition. - -Field 3 -- # of writes issued - This is the total number of writes issued to this partition. - -Field 4 -- # of sectors written - This is the total number of sectors requested to be written to - this partition. - -Note that since the address is translated to a disk-relative one, and no -record of the partition-relative address is kept, the subsequent success -or failure of the read cannot be attributed to the partition. In other -words, the number of reads for partitions is counted slightly before time -of queuing for partitions, and at completion for whole disks. This is -a subtle distinction that is probably uninteresting for most cases. - -More significant is the error induced by counting the numbers of -reads/writes before merges for partitions and after for disks. Since a -typical workload usually contains a lot of successive and adjacent requests, -the number of reads/writes issued can be several times higher than the -number of reads/writes completed. - -In 2.6.25, the full statistic set is again available for partitions and -disk and partition statistics are consistent again. Since we still don't -keep record of the partition-relative address, an operation is attributed to -the partition which contains the first sector of the request after the -eventual merges. As requests can be merged across partition, this could lead -to some (probably insignificant) inaccuracy. - -Additional notes ----------------- - -In 2.6+, sysfs is not mounted by default. If your distribution of -Linux hasn't added it already, here's the line you'll want to add to -your ``/etc/fstab``:: - - none /sys sysfs defaults 0 0 - - -In 2.6+, all disk statistics were removed from ``/proc/stat``. In 2.4, they -appear in both ``/proc/partitions`` and ``/proc/stat``, although the ones in -``/proc/stat`` take a very different format from those in ``/proc/partitions`` -(see proc(5), if your system has it.) - --- ricklind@us.ibm.com diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt deleted file mode 100644 index 4f18456dd3b1..000000000000 --- a/Documentation/kernel-per-CPU-kthreads.txt +++ /dev/null @@ -1,356 +0,0 @@ -========================================== -Reducing OS jitter due to per-cpu kthreads -========================================== - -This document lists per-CPU kthreads in the Linux kernel and presents -options to control their OS jitter. Note that non-per-CPU kthreads are -not listed here. To reduce OS jitter from non-per-CPU kthreads, bind -them to a "housekeeping" CPU dedicated to such work. - -References -========== - -- Documentation/IRQ-affinity.txt: Binding interrupts to sets of CPUs. - -- Documentation/admin-guide/cgroup-v1: Using cgroups to bind tasks to sets of CPUs. - -- man taskset: Using the taskset command to bind tasks to sets - of CPUs. - -- man sched_setaffinity: Using the sched_setaffinity() system - call to bind tasks to sets of CPUs. - -- /sys/devices/system/cpu/cpuN/online: Control CPU N's hotplug state, - writing "0" to offline and "1" to online. - -- In order to locate kernel-generated OS jitter on CPU N: - - cd /sys/kernel/debug/tracing - echo 1 > max_graph_depth # Increase the "1" for more detail - echo function_graph > current_tracer - # run workload - cat per_cpu/cpuN/trace - -kthreads -======== - -Name: - ehca_comp/%u - -Purpose: - Periodically process Infiniband-related work. - -To reduce its OS jitter, do any of the following: - -1. Don't use eHCA Infiniband hardware, instead choosing hardware - that does not require per-CPU kthreads. This will prevent these - kthreads from being created in the first place. (This will - work for most people, as this hardware, though important, is - relatively old and is produced in relatively low unit volumes.) -2. Do all eHCA-Infiniband-related work on other CPUs, including - interrupts. -3. Rework the eHCA driver so that its per-CPU kthreads are - provisioned only on selected CPUs. - - -Name: - irq/%d-%s - -Purpose: - Handle threaded interrupts. - -To reduce its OS jitter, do the following: - -1. Use irq affinity to force the irq threads to execute on - some other CPU. - -Name: - kcmtpd_ctr_%d - -Purpose: - Handle Bluetooth work. - -To reduce its OS jitter, do one of the following: - -1. Don't use Bluetooth, in which case these kthreads won't be - created in the first place. -2. Use irq affinity to force Bluetooth-related interrupts to - occur on some other CPU and furthermore initiate all - Bluetooth activity on some other CPU. - -Name: - ksoftirqd/%u - -Purpose: - Execute softirq handlers when threaded or when under heavy load. - -To reduce its OS jitter, each softirq vector must be handled -separately as follows: - -TIMER_SOFTIRQ -------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by forcing - both kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. After boot completes, force - the CPU offline, then bring it back online. This forces - recurring timers to migrate elsewhere. If you are concerned - with multiple CPUs, force them all offline before bringing the - first one back online. Once you have onlined the CPUs in question, - do not offline any other CPUs, because doing so could force the - timer back onto one of the CPUs in question. - -NET_TX_SOFTIRQ and NET_RX_SOFTIRQ ---------------------------------- - -Do all of the following: - -1. Force networking interrupts onto other CPUs. -2. Initiate any network I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -BLOCK_SOFTIRQ -------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -IRQ_POLL_SOFTIRQ ----------------- - -Do all of the following: - -1. Force block-device interrupts onto some other CPU. -2. Initiate any block I/O and block-I/O polling on other CPUs. -3. Once your application has started, prevent CPU-hotplug operations - from being initiated from tasks that might run on the CPU to - be de-jittered. (It is OK to force this CPU offline and then - bring it back online before you start your application.) - -TASKLET_SOFTIRQ ---------------- - -Do one or more of the following: - -1. Avoid use of drivers that use tasklets. (Such drivers will contain - calls to things like tasklet_schedule().) -2. Convert all drivers that you must use from tasklets to workqueues. -3. Force interrupts for drivers using tasklets onto other CPUs, - and also do I/O involving these drivers on other CPUs. - -SCHED_SOFTIRQ -------------- - -Do all of the following: - -1. Avoid sending scheduler IPIs to the CPU to be de-jittered, - for example, ensure that at most one runnable kthread is present - on that CPU. If a thread that expects to run on the de-jittered - CPU awakens, the scheduler will send an IPI that can result in - a subsequent SCHED_SOFTIRQ. -2. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be de-jittered - is marked as an adaptive-ticks CPU using the "nohz_full=" - boot parameter. This reduces the number of scheduler-clock - interrupts that the de-jittered CPU receives, minimizing its - chances of being selected to do the load balancing work that - runs in SCHED_SOFTIRQ context. -3. To the extent possible, keep the CPU out of the kernel when it - is non-idle, for example, by avoiding system calls and by - forcing both kernel threads and interrupts to execute elsewhere. - This further reduces the number of scheduler-clock interrupts - received by the de-jittered CPU. - -HRTIMER_SOFTIRQ ---------------- - -Do all of the following: - -1. To the extent possible, keep the CPU out of the kernel when it - is non-idle. For example, avoid system calls and force both - kernel threads and interrupts to execute elsewhere. -2. Build with CONFIG_HOTPLUG_CPU=y. Once boot completes, force the - CPU offline, then bring it back online. This forces recurring - timers to migrate elsewhere. If you are concerned with multiple - CPUs, force them all offline before bringing the first one - back online. Once you have onlined the CPUs in question, do not - offline any other CPUs, because doing so could force the timer - back onto one of the CPUs in question. - -RCU_SOFTIRQ ------------ - -Do at least one of the following: - -1. Offload callbacks and keep the CPU in either dyntick-idle or - adaptive-ticks state by doing all of the following: - - a. CONFIG_NO_HZ_FULL=y and ensure that the CPU to be - de-jittered is marked as an adaptive-ticks CPU using the - "nohz_full=" boot parameter. Bind the rcuo kthreads to - housekeeping CPUs, which can tolerate OS jitter. - b. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -2. Enable RCU to do its processing remotely via dyntick-idle by - doing all of the following: - - a. Build with CONFIG_NO_HZ=y and CONFIG_RCU_FAST_NO_HZ=y. - b. Ensure that the CPU goes idle frequently, allowing other - CPUs to detect that it has passed through an RCU quiescent - state. If the kernel is built with CONFIG_NO_HZ_FULL=y, - userspace execution also allows other CPUs to detect that - the CPU in question has passed through a quiescent state. - c. To the extent possible, keep the CPU out of the kernel - when it is non-idle, for example, by avoiding system - calls and by forcing both kernel threads and interrupts - to execute elsewhere. - -Name: - kworker/%u:%d%s (cpu, id, priority) - -Purpose: - Execute workqueue requests - -To reduce its OS jitter, do any of the following: - -1. Run your workload at a real-time priority, which will allow - preempting the kworker daemons. -2. A given workqueue can be made visible in the sysfs filesystem - by passing the WQ_SYSFS to that workqueue's alloc_workqueue(). - Such a workqueue can be confined to a given subset of the - CPUs using the ``/sys/devices/virtual/workqueue/*/cpumask`` sysfs - files. The set of WQ_SYSFS workqueues can be displayed using - "ls sys/devices/virtual/workqueue". That said, the workqueues - maintainer would like to caution people against indiscriminately - sprinkling WQ_SYSFS across all the workqueues. The reason for - caution is that it is easy to add WQ_SYSFS, but because sysfs is - part of the formal user/kernel API, it can be nearly impossible - to remove it, even if its addition was a mistake. -3. Do any of the following needed to avoid jitter that your - application cannot tolerate: - - a. Build your kernel with CONFIG_SLUB=y rather than - CONFIG_SLAB=y, thus avoiding the slab allocator's periodic - use of each CPU's workqueues to run its cache_reap() - function. - b. Avoid using oprofile, thus avoiding OS jitter from - wq_sync_buffer(). - c. Limit your CPU frequency so that a CPU-frequency - governor is not required, possibly enlisting the aid of - special heatsinks or other cooling technologies. If done - correctly, and if you CPU architecture permits, you should - be able to build your kernel with CONFIG_CPU_FREQ=n to - avoid the CPU-frequency governor periodically running - on each CPU, including cs_dbs_timer() and od_dbs_timer(). - - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - d. As of v3.18, Christoph Lameter's on-demand vmstat workers - commit prevents OS jitter due to vmstat_update() on - CONFIG_SMP=y systems. Before v3.18, is not possible - to entirely get rid of the OS jitter, but you can - decrease its frequency by writing a large value to - /proc/sys/vm/stat_interval. The default value is HZ, - for an interval of one second. Of course, larger values - will make your virtual-memory statistics update more - slowly. Of course, you can also run your workload at - a real-time priority, thus preempting vmstat_update(), - but if your workload is CPU-bound, this is a bad idea. - However, there is an RFC patch from Christoph Lameter - (based on an earlier one from Gilad Ben-Yossef) that - reduces or even eliminates vmstat overhead for some - workloads at https://lkml.org/lkml/2013/9/4/379. - e. Boot with "elevator=noop" to avoid workqueue use by - the block layer. - f. If running on high-end powerpc servers, build with - CONFIG_PPC_RTAS_DAEMON=n. This prevents the RTAS - daemon from running on each CPU every second or so. - (This will require editing Kconfig files and will defeat - this platform's RAS functionality.) This avoids jitter - due to the rtas_event_scan() function. - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - g. If running on Cell Processor, build your kernel with - CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from - spu_gov_work(). - WARNING: Please check your CPU specifications to - make sure that this is safe on your particular system. - h. If running on PowerMAC, build your kernel with - CONFIG_PMAC_RACKMETER=n to disable the CPU-meter, - avoiding OS jitter from rackmeter_do_timer(). - -Name: - rcuc/%u - -Purpose: - Execute RCU callbacks in CONFIG_RCU_BOOST=y kernels. - -To reduce its OS jitter, do at least one of the following: - -1. Build the kernel with CONFIG_PREEMPT=n. This prevents these - kthreads from being created in the first place, and also obviates - the need for RCU priority boosting. This approach is feasible - for workloads that do not require high degrees of responsiveness. -2. Build the kernel with CONFIG_RCU_BOOST=n. This prevents these - kthreads from being created in the first place. This approach - is feasible only if your workload never requires RCU priority - boosting, for example, if you ensure frequent idle time on all - CPUs that might execute within the kernel. -3. Build with CONFIG_RCU_NOCB_CPU=y and boot with the rcu_nocbs= - boot parameter offloading RCU callbacks from all CPUs susceptible - to OS jitter. This approach prevents the rcuc/%u kthreads from - having any work to do, so that they are never awakened. -4. Ensure that the CPU never enters the kernel, and, in particular, - avoid initiating any CPU hotplug operations on this CPU. This is - another way of preventing any callbacks from being queued on the - CPU, again preventing the rcuc/%u kthreads from having any work - to do. - -Name: - rcuop/%d and rcuos/%d - -Purpose: - Offload RCU callbacks from the corresponding CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Use affinity, cgroups, or other mechanism to force these kthreads - to execute on some other CPU. -2. Build with CONFIG_RCU_NOCB_CPU=n, which will prevent these - kthreads from being created in the first place. However, please - note that this will not eliminate OS jitter, but will instead - shift it to RCU_SOFTIRQ. - -Name: - watchdog/%u - -Purpose: - Detect software lockups on each CPU. - -To reduce its OS jitter, do at least one of the following: - -1. Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these - kthreads from being created in the first place. -2. Boot with "nosoftlockup=0", which will also prevent these kthreads - from being created. Other related watchdog and softlockup boot - parameters may be found in Documentation/admin-guide/kernel-parameters.rst - and Documentation/watchdog/watchdog-parameters.rst. -3. Echo a zero to /proc/sys/kernel/watchdog to disable the - watchdog timer. -4. Echo a large number of /proc/sys/kernel/watchdog_thresh in - order to reduce the frequency of OS jitter due to the watchdog - timer down to a level that is acceptable for your workload. diff --git a/Documentation/ldm.txt b/Documentation/ldm.txt deleted file mode 100644 index 12c571368e73..000000000000 --- a/Documentation/ldm.txt +++ /dev/null @@ -1,121 +0,0 @@ -========================================== -LDM - Logical Disk Manager (Dynamic Disks) -========================================== - -:Author: Originally Written by FlatCap - Richard Russon . -:Last Updated: Anton Altaparmakov on 30 March 2007 for Windows Vista. - -Overview --------- - -Windows 2000, XP, and Vista use a new partitioning scheme. It is a complete -replacement for the MSDOS style partitions. It stores its information in a -1MiB journalled database at the end of the physical disk. The size of -partitions is limited only by disk space. The maximum number of partitions is -nearly 2000. - -Any partitions created under the LDM are called "Dynamic Disks". There are no -longer any primary or extended partitions. Normal MSDOS style partitions are -now known as Basic Disks. - -If you wish to use Spanned, Striped, Mirrored or RAID 5 Volumes, you must use -Dynamic Disks. The journalling allows Windows to make changes to these -partitions and filesystems without the need to reboot. - -Once the LDM driver has divided up the disk, you can use the MD driver to -assemble any multi-partition volumes, e.g. Stripes, RAID5. - -To prevent legacy applications from repartitioning the disk, the LDM creates a -dummy MSDOS partition containing one disk-sized partition. This is what is -supported with the Linux LDM driver. - -A newer approach that has been implemented with Vista is to put LDM on top of a -GPT label disk. This is not supported by the Linux LDM driver yet. - - -Example -------- - -Below we have a 50MiB disk, divided into seven partitions. - -.. note:: - - The missing 1MiB at the end of the disk is where the LDM database is - stored. - -+-------++--------------+---------+-----++--------------+---------+----+ -|Device || Offset Bytes | Sectors | MiB || Size Bytes | Sectors | MiB| -+=======++==============+=========+=====++==============+=========+====+ -|hda || 0 | 0 | 0 || 52428800 | 102400 | 50| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda1 || 51380224 | 100352 | 49 || 1048576 | 2048 | 1| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda2 || 16384 | 32 | 0 || 6979584 | 13632 | 6| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda3 || 6995968 | 13664 | 6 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda4 || 17481728 | 34144 | 16 || 4194304 | 8192 | 4| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda5 || 21676032 | 42336 | 20 || 5242880 | 10240 | 5| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda6 || 26918912 | 52576 | 25 || 10485760 | 20480 | 10| -+-------++--------------+---------+-----++--------------+---------+----+ -|hda7 || 37404672 | 73056 | 35 || 13959168 | 27264 | 13| -+-------++--------------+---------+-----++--------------+---------+----+ - -The LDM Database may not store the partitions in the order that they appear on -disk, but the driver will sort them. - -When Linux boots, you will see something like:: - - hda: 102400 sectors w/32KiB Cache, CHS=50/64/32 - hda: [LDM] hda1 hda2 hda3 hda4 hda5 hda6 hda7 - - -Compiling LDM Support ---------------------- - -To enable LDM, choose the following two options: - - - "Advanced partition selection" CONFIG_PARTITION_ADVANCED - - "Windows Logical Disk Manager (Dynamic Disk) support" CONFIG_LDM_PARTITION - -If you believe the driver isn't working as it should, you can enable the extra -debugging code. This will produce a LOT of output. The option is: - - - "Windows LDM extra logging" CONFIG_LDM_DEBUG - -N.B. The partition code cannot be compiled as a module. - -As with all the partition code, if the driver doesn't see signs of its type of -partition, it will pass control to another driver, so there is no harm in -enabling it. - -If you have Dynamic Disks but don't enable the driver, then all you will see -is a dummy MSDOS partition filling the whole disk. You won't be able to mount -any of the volumes on the disk. - - -Booting -------- - -If you enable LDM support, then lilo is capable of booting from any of the -discovered partitions. However, grub does not understand the LDM partitioning -and cannot boot from a Dynamic Disk. - - -More Documentation ------------------- - -There is an Overview of the LDM together with complete Technical Documentation. -It is available for download. - - http://www.linux-ntfs.org/ - -If you have any LDM questions that aren't answered in the documentation, email -me. - -Cheers, - FlatCap - Richard Russon - ldm@flatcap.org - diff --git a/Documentation/lockup-watchdogs.txt b/Documentation/lockup-watchdogs.txt deleted file mode 100644 index 290840c160af..000000000000 --- a/Documentation/lockup-watchdogs.txt +++ /dev/null @@ -1,83 +0,0 @@ -=============================================================== -Softlockup detector and hardlockup detector (aka nmi_watchdog) -=============================================================== - -The Linux kernel can act as a watchdog to detect both soft and hard -lockups. - -A 'softlockup' is defined as a bug that causes the kernel to loop in -kernel mode for more than 20 seconds (see "Implementation" below for -details), without giving other tasks a chance to run. The current -stack trace is displayed upon detection and, by default, the system -will stay locked up. Alternatively, the kernel can be configured to -panic; a sysctl, "kernel.softlockup_panic", a kernel parameter, -"softlockup_panic" (see "Documentation/admin-guide/kernel-parameters.rst" for -details), and a compile option, "BOOTPARAM_SOFTLOCKUP_PANIC", are -provided for this. - -A 'hardlockup' is defined as a bug that causes the CPU to loop in -kernel mode for more than 10 seconds (see "Implementation" below for -details), without letting other interrupts have a chance to run. -Similarly to the softlockup case, the current stack trace is displayed -upon detection and the system will stay locked up unless the default -behavior is changed, which can be done through a sysctl, -'hardlockup_panic', a compile time knob, "BOOTPARAM_HARDLOCKUP_PANIC", -and a kernel parameter, "nmi_watchdog" -(see "Documentation/admin-guide/kernel-parameters.rst" for details). - -The panic option can be used in combination with panic_timeout (this -timeout is set through the confusingly named "kernel.panic" sysctl), -to cause the system to reboot automatically after a specified amount -of time. - -Implementation -============== - -The soft and hard lockup detectors are built on top of the hrtimer and -perf subsystems, respectively. A direct consequence of this is that, -in principle, they should work in any architecture where these -subsystems are present. - -A periodic hrtimer runs to generate interrupts and kick the watchdog -task. An NMI perf event is generated every "watchdog_thresh" -(compile-time initialized to 10 and configurable through sysctl of the -same name) seconds to check for hardlockups. If any CPU in the system -does not receive any hrtimer interrupt during that time the -'hardlockup detector' (the handler for the NMI perf event) will -generate a kernel warning or call panic, depending on the -configuration. - -The watchdog task is a high priority kernel thread that updates a -timestamp every time it is scheduled. If that timestamp is not updated -for 2*watchdog_thresh seconds (the softlockup threshold) the -'softlockup detector' (coded inside the hrtimer callback function) -will dump useful debug information to the system log, after which it -will call panic if it was instructed to do so or resume execution of -other kernel code. - -The period of the hrtimer is 2*watchdog_thresh/5, which means it has -two or three chances to generate an interrupt before the hardlockup -detector kicks in. - -As explained above, a kernel knob is provided that allows -administrators to configure the period of the hrtimer and the perf -event. The right value for a particular environment is a trade-off -between fast response to lockups and detection overhead. - -By default, the watchdog runs on all online cores. However, on a -kernel configured with NO_HZ_FULL, by default the watchdog runs only -on the housekeeping cores, not the cores specified in the "nohz_full" -boot argument. If we allowed the watchdog to run by default on -the "nohz_full" cores, we would have to run timer ticks to activate -the scheduler, which would prevent the "nohz_full" functionality -from protecting the user code on those cores from the kernel. -Of course, disabling it by default on the nohz_full cores means that -when those cores do enter the kernel, by default we will not be -able to detect if they lock up. However, allowing the watchdog -to continue to run on the housekeeping (non-tickless) cores means -that we will continue to detect lockups properly on those cores. - -In either case, the set of cores excluded from running the watchdog -may be adjusted via the kernel.watchdog_cpumask sysctl. For -nohz_full cores, this may be useful for debugging a case where the -kernel seems to be hanging on the nohz_full cores. diff --git a/Documentation/numastat.txt b/Documentation/numastat.txt deleted file mode 100644 index aaf1667489f8..000000000000 --- a/Documentation/numastat.txt +++ /dev/null @@ -1,30 +0,0 @@ -=============================== -Numa policy hit/miss statistics -=============================== - -/sys/devices/system/node/node*/numastat - -All units are pages. Hugepages have separate counters. - -=============== ============================================================ -numa_hit A process wanted to allocate memory from this node, - and succeeded. - -numa_miss A process wanted to allocate memory from another node, - but ended up with memory from this node. - -numa_foreign A process wanted to allocate on this node, - but ended up with memory from another one. - -local_node A process ran on this node and got memory from it. - -other_node A process ran on this node and got memory from another node. - -interleave_hit Interleaving wanted to allocate from this node - and succeeded. -=============== ============================================================ - -For easier reading you can use the numastat utility from the numactl package -(http://oss.sgi.com/projects/libnuma/). Note that it only works -well right now on machines with a small number of CPUs. - diff --git a/Documentation/pnp.txt b/Documentation/pnp.txt deleted file mode 100644 index bab2d10631f0..000000000000 --- a/Documentation/pnp.txt +++ /dev/null @@ -1,292 +0,0 @@ -================================= -Linux Plug and Play Documentation -================================= - -:Author: Adam Belay -:Last updated: Oct. 16, 2002 - - -Overview --------- - -Plug and Play provides a means of detecting and setting resources for legacy or -otherwise unconfigurable devices. The Linux Plug and Play Layer provides these -services to compatible drivers. - - -The User Interface ------------------- - -The Linux Plug and Play user interface provides a means to activate PnP devices -for legacy and user level drivers that do not support Linux Plug and Play. The -user interface is integrated into sysfs. - -In addition to the standard sysfs file the following are created in each -device's directory: -- id - displays a list of support EISA IDs -- options - displays possible resource configurations -- resources - displays currently allocated resources and allows resource changes - -activating a device -^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "auto" > resources - -this will invoke the automatic resource config system to activate the device - -manually activating a device -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:: - - # echo "manual " > resources - - - the configuration number - - static or dynamic - static = for next boot - dynamic = now - -disabling a device -^^^^^^^^^^^^^^^^^^ - -:: - - # echo "disable" > resources - - -EXAMPLE: - -Suppose you need to activate the floppy disk controller. - -1. change to the proper directory, in my case it is - /driver/bus/pnp/devices/00:0f:: - - # cd /driver/bus/pnp/devices/00:0f - # cat name - PC standard floppy disk controller - -2. check if the device is already active:: - - # cat resources - DISABLED - - - Notice the string "DISABLED". This means the device is not active. - -3. check the device's possible configurations (optional):: - - # cat options - Dependent: 01 - Priority acceptable - port 0x3f0-0x3f0, align 0x7, size 0x6, 16-bit address decoding - port 0x3f7-0x3f7, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - Dependent: 02 - Priority acceptable - port 0x370-0x370, align 0x7, size 0x6, 16-bit address decoding - port 0x377-0x377, align 0x0, size 0x1, 16-bit address decoding - irq 6 - dma 2 8-bit compatible - -4. now activate the device:: - - # echo "auto" > resources - -5. finally check if the device is active:: - - # cat resources - io 0x3f0-0x3f5 - io 0x3f7-0x3f7 - irq 6 - dma 2 - -also there are a series of kernel parameters:: - - pnp_reserve_irq=irq1[,irq2] .... - pnp_reserve_dma=dma1[,dma2] .... - pnp_reserve_io=io1,size1[,io2,size2] .... - pnp_reserve_mem=mem1,size1[,mem2,size2] .... - - - -The Unified Plug and Play Layer -------------------------------- - -All Plug and Play drivers, protocols, and services meet at a central location -called the Plug and Play Layer. This layer is responsible for the exchange of -information between PnP drivers and PnP protocols. Thus it automatically -forwards commands to the proper protocol. This makes writing PnP drivers -significantly easier. - -The following functions are available from the Plug and Play Layer: - -pnp_get_protocol - increments the number of uses by one - -pnp_put_protocol - deincrements the number of uses by one - -pnp_register_protocol - use this to register a new PnP protocol - -pnp_unregister_protocol - use this function to remove a PnP protocol from the Plug and Play Layer - -pnp_register_driver - adds a PnP driver to the Plug and Play Layer - - this includes driver model integration - returns zero for success or a negative error number for failure; count - calls to the .add() method if you need to know how many devices bind to - the driver - -pnp_unregister_driver - removes a PnP driver from the Plug and Play Layer - - - -Plug and Play Protocols ------------------------ - -This section contains information for PnP protocol developers. - -The following Protocols are currently available in the computing world: - -- PNPBIOS: - used for system devices such as serial and parallel ports. -- ISAPNP: - provides PnP support for the ISA bus -- ACPI: - among its many uses, ACPI provides information about system level - devices. - -It is meant to replace the PNPBIOS. It is not currently supported by Linux -Plug and Play but it is planned to be in the near future. - - -Requirements for a Linux PnP protocol: -1. the protocol must use EISA IDs -2. the protocol must inform the PnP Layer of a device's current configuration - -- the ability to set resources is optional but preferred. - -The following are PnP protocol related functions: - -pnp_add_device - use this function to add a PnP device to the PnP layer - - only call this function when all wanted values are set in the pnp_dev - structure - -pnp_init_device - call this to initialize the PnP structure - -pnp_remove_device - call this to remove a device from the Plug and Play Layer. - it will fail if the device is still in use. - automatically will free mem used by the device and related structures - -pnp_add_id - adds an EISA ID to the list of supported IDs for the specified device - -For more information consult the source of a protocol such as -/drivers/pnp/pnpbios/core.c. - - - -Linux Plug and Play Drivers ---------------------------- - -This section contains information for Linux PnP driver developers. - -The New Way -^^^^^^^^^^^ - -1. first make a list of supported EISA IDS - - ex:: - - static const struct pnp_id pnp_dev_table[] = { - /* Standard LPT Printer Port */ - {.id = "PNP0400", .driver_data = 0}, - /* ECP Printer Port */ - {.id = "PNP0401", .driver_data = 0}, - {.id = ""} - }; - - Please note that the character 'X' can be used as a wild card in the function - portion (last four characters). - - ex:: - - /* Unknown PnP modems */ - { "PNPCXXX", UNKNOWN_DEV }, - - Supported PnP card IDs can optionally be defined. - ex:: - - static const struct pnp_id pnp_card_table[] = { - { "ANYDEVS", 0 }, - { "", 0 } - }; - -2. Optionally define probe and remove functions. It may make sense not to - define these functions if the driver already has a reliable method of detecting - the resources, such as the parport_pc driver. - - ex:: - - static int - serial_pnp_probe(struct pnp_dev * dev, const struct pnp_id *card_id, const - struct pnp_id *dev_id) - { - . . . - - ex:: - - static void serial_pnp_remove(struct pnp_dev * dev) - { - . . . - - consult /drivers/serial/8250_pnp.c for more information. - -3. create a driver structure - - ex:: - - static struct pnp_driver serial_pnp_driver = { - .name = "serial", - .card_id_table = pnp_card_table, - .id_table = pnp_dev_table, - .probe = serial_pnp_probe, - .remove = serial_pnp_remove, - }; - - * name and id_table cannot be NULL. - -4. register the driver - - ex:: - - static int __init serial8250_pnp_init(void) - { - return pnp_register_driver(&serial_pnp_driver); - } - -The Old Way -^^^^^^^^^^^ - -A series of compatibility functions have been created to make it easy to convert -ISAPNP drivers. They should serve as a temporary solution only. - -They are as follows:: - - struct pnp_card *pnp_find_card(unsigned short vendor, - unsigned short device, - struct pnp_card *from) - - struct pnp_dev *pnp_find_dev(struct pnp_card *card, - unsigned short vendor, - unsigned short function, - struct pnp_dev *from) - diff --git a/Documentation/rtc.txt b/Documentation/rtc.txt deleted file mode 100644 index 688c95b11919..000000000000 --- a/Documentation/rtc.txt +++ /dev/null @@ -1,140 +0,0 @@ -======================================= -Real Time Clock (RTC) Drivers for Linux -======================================= - -When Linux developers talk about a "Real Time Clock", they usually mean -something that tracks wall clock time and is battery backed so that it -works even with system power off. Such clocks will normally not track -the local time zone or daylight savings time -- unless they dual boot -with MS-Windows -- but will instead be set to Coordinated Universal Time -(UTC, formerly "Greenwich Mean Time"). - -The newest non-PC hardware tends to just count seconds, like the time(2) -system call reports, but RTCs also very commonly represent time using -the Gregorian calendar and 24 hour time, as reported by gmtime(3). - -Linux has two largely-compatible userspace RTC API families you may -need to know about: - - * /dev/rtc ... is the RTC provided by PC compatible systems, - so it's not very portable to non-x86 systems. - - * /dev/rtc0, /dev/rtc1 ... are part of a framework that's - supported by a wide variety of RTC chips on all systems. - -Programmers need to understand that the PC/AT functionality is not -always available, and some systems can do much more. That is, the -RTCs use the same API to make requests in both RTC frameworks (using -different filenames of course), but the hardware may not offer the -same functionality. For example, not every RTC is hooked up to an -IRQ, so they can't all issue alarms; and where standard PC RTCs can -only issue an alarm up to 24 hours in the future, other hardware may -be able to schedule one any time in the upcoming century. - - -Old PC/AT-Compatible driver: /dev/rtc --------------------------------------- - -All PCs (even Alpha machines) have a Real Time Clock built into them. -Usually they are built into the chipset of the computer, but some may -actually have a Motorola MC146818 (or clone) on the board. This is the -clock that keeps the date and time while your computer is turned off. - -ACPI has standardized that MC146818 functionality, and extended it in -a few ways (enabling longer alarm periods, and wake-from-hibernate). -That functionality is NOT exposed in the old driver. - -However it can also be used to generate signals from a slow 2Hz to a -relatively fast 8192Hz, in increments of powers of two. These signals -are reported by interrupt number 8. (Oh! So *that* is what IRQ 8 is -for...) It can also function as a 24hr alarm, raising IRQ 8 when the -alarm goes off. The alarm can also be programmed to only check any -subset of the three programmable values, meaning that it could be set to -ring on the 30th second of the 30th minute of every hour, for example. -The clock can also be set to generate an interrupt upon every clock -update, thus generating a 1Hz signal. - -The interrupts are reported via /dev/rtc (major 10, minor 135, read only -character device) in the form of an unsigned long. The low byte contains -the type of interrupt (update-done, alarm-rang, or periodic) that was -raised, and the remaining bytes contain the number of interrupts since -the last read. Status information is reported through the pseudo-file -/proc/driver/rtc if the /proc filesystem was enabled. The driver has -built in locking so that only one process is allowed to have the /dev/rtc -interface open at a time. - -A user process can monitor these interrupts by doing a read(2) or a -select(2) on /dev/rtc -- either will block/stop the user process until -the next interrupt is received. This is useful for things like -reasonably high frequency data acquisition where one doesn't want to -burn up 100% CPU by polling gettimeofday etc. etc. - -At high frequencies, or under high loads, the user process should check -the number of interrupts received since the last read to determine if -there has been any interrupt "pileup" so to speak. Just for reference, a -typical 486-33 running a tight read loop on /dev/rtc will start to suffer -occasional interrupt pileup (i.e. > 1 IRQ event since last read) for -frequencies above 1024Hz. So you really should check the high bytes -of the value you read, especially at frequencies above that of the -normal timer interrupt, which is 100Hz. - -Programming and/or enabling interrupt frequencies greater than 64Hz is -only allowed by root. This is perhaps a bit conservative, but we don't want -an evil user generating lots of IRQs on a slow 386sx-16, where it might have -a negative impact on performance. This 64Hz limit can be changed by writing -a different value to /proc/sys/dev/rtc/max-user-freq. Note that the -interrupt handler is only a few lines of code to minimize any possibility -of this effect. - -Also, if the kernel time is synchronized with an external source, the -kernel will write the time back to the CMOS clock every 11 minutes. In -the process of doing this, the kernel briefly turns off RTC periodic -interrupts, so be aware of this if you are doing serious work. If you -don't synchronize the kernel time with an external source (via ntp or -whatever) then the kernel will keep its hands off the RTC, allowing you -exclusive access to the device for your applications. - -The alarm and/or interrupt frequency are programmed into the RTC via -various ioctl(2) calls as listed in ./include/linux/rtc.h -Rather than write 50 pages describing the ioctl() and so on, it is -perhaps more useful to include a small test program that demonstrates -how to use them, and demonstrates the features of the driver. This is -probably a lot more useful to people interested in writing applications -that will be using this driver. See the code at the end of this document. - -(The original /dev/rtc driver was written by Paul Gortmaker.) - - -New portable "RTC Class" drivers: /dev/rtcN --------------------------------------------- - -Because Linux supports many non-ACPI and non-PC platforms, some of which -have more than one RTC style clock, it needed a more portable solution -than expecting a single battery-backed MC146818 clone on every system. -Accordingly, a new "RTC Class" framework has been defined. It offers -three different userspace interfaces: - - * /dev/rtcN ... much the same as the older /dev/rtc interface - - * /sys/class/rtc/rtcN ... sysfs attributes support readonly - access to some RTC attributes. - - * /proc/driver/rtc ... the system clock RTC may expose itself - using a procfs interface. If there is no RTC for the system clock, - rtc0 is used by default. More information is (currently) shown - here than through sysfs. - -The RTC Class framework supports a wide variety of RTCs, ranging from those -integrated into embeddable system-on-chip (SOC) processors to discrete chips -using I2C, SPI, or some other bus to communicate with the host CPU. There's -even support for PC-style RTCs ... including the features exposed on newer PCs -through ACPI. - -The new framework also removes the "one RTC per system" restriction. For -example, maybe the low-power battery-backed RTC is a discrete I2C chip, but -a high functionality RTC is integrated into the SOC. That system might read -the system clock from the discrete RTC, but use the integrated one for all -other tasks, because of its greater functionality. - -Check out tools/testing/selftests/rtc/rtctest.c for an example usage of the -ioctl interface. diff --git a/Documentation/svga.txt b/Documentation/svga.txt deleted file mode 100644 index b6c2f9acca92..000000000000 --- a/Documentation/svga.txt +++ /dev/null @@ -1,249 +0,0 @@ -.. include:: - -================================= -Video Mode Selection Support 2.13 -================================= - -:Copyright: |copy| 1995--1999 Martin Mares, - -Intro -~~~~~ - -This small document describes the "Video Mode Selection" feature which -allows the use of various special video modes supported by the video BIOS. Due -to usage of the BIOS, the selection is limited to boot time (before the -kernel decompression starts) and works only on 80X86 machines. - -.. note:: - - Short intro for the impatient: Just use vga=ask for the first time, - enter ``scan`` on the video mode prompt, pick the mode you want to use, - remember its mode ID (the four-digit hexadecimal number) and then - set the vga parameter to this number (converted to decimal first). - -The video mode to be used is selected by a kernel parameter which can be -specified in the kernel Makefile (the SVGA_MODE=... line) or by the "vga=..." -option of LILO (or some other boot loader you use) or by the "vidmode" utility -(present in standard Linux utility packages). You can use the following values -of this parameter:: - - NORMAL_VGA - Standard 80x25 mode available on all display adapters. - - EXTENDED_VGA - Standard 8-pixel font mode: 80x43 on EGA, 80x50 on VGA. - - ASK_VGA - Display a video mode menu upon startup (see below). - - 0..35 - Menu item number (when you have used the menu to view the list of - modes available on your adapter, you can specify the menu item you want - to use). 0..9 correspond to "0".."9", 10..35 to "a".."z". Warning: the - mode list displayed may vary as the kernel version changes, because the - modes are listed in a "first detected -- first displayed" manner. It's - better to use absolute mode numbers instead. - - 0x.... - Hexadecimal video mode ID (also displayed on the menu, see below - for exact meaning of the ID). Warning: rdev and LILO don't support - hexadecimal numbers -- you have to convert it to decimal manually. - -Menu -~~~~ - -The ASK_VGA mode causes the kernel to offer a video mode menu upon -bootup. It displays a "Press to see video modes available, -to continue or wait 30 secs" message. If you press , you enter the -menu, if you press or wait 30 seconds, the kernel will boot up in -the standard 80x25 mode. - -The menu looks like:: - - Video adapter: - Mode: COLSxROWS: - 0 0F00 80x25 - 1 0F01 80x50 - 2 0F02 80x43 - 3 0F03 80x26 - .... - Enter mode number or ``scan``: - - tells what video adapter did Linux detect --- it's either a generic adapter name (MDA, CGA, HGC, EGA, VGA, VESA VGA [a VGA -with VESA-compliant BIOS]) or a chipset name (e.g., Trident). Direct detection -of chipsets is turned off by default as it's inherently unreliable due to -absolutely insane PC design. - -"0 0F00 80x25" means that the first menu item (the menu items are numbered -from "0" to "9" and from "a" to "z") is a 80x25 mode with ID=0x0f00 (see the -next section for a description of mode IDs). - - encourages you to enter the item number or mode ID -you wish to set and press . If the computer complains something about -"Unknown mode ID", it is trying to tell you that it isn't possible to set such -a mode. It's also possible to press only which leaves the current mode. - -The mode list usually contains a few basic modes and some VESA modes. In -case your chipset has been detected, some chipset-specific modes are shown as -well (some of these might be missing or unusable on your machine as different -BIOSes are often shipped with the same card and the mode numbers depend purely -on the VGA BIOS). - -The modes displayed on the menu are partially sorted: The list starts with -the standard modes (80x25 and 80x50) followed by "special" modes (80x28 and -80x43), local modes (if the local modes feature is enabled), VESA modes and -finally SVGA modes for the auto-detected adapter. - -If you are not happy with the mode list offered (e.g., if you think your card -is able to do more), you can enter "scan" instead of item number / mode ID. The -program will try to ask the BIOS for all possible video mode numbers and test -what happens then. The screen will be probably flashing wildly for some time and -strange noises will be heard from inside the monitor and so on and then, really -all consistent video modes supported by your BIOS will appear (plus maybe some -``ghost modes``). If you are afraid this could damage your monitor, don't use -this function. - -After scanning, the mode ordering is a bit different: the auto-detected SVGA -modes are not listed at all and the modes revealed by ``scan`` are shown before -all VESA modes. - -Mode IDs -~~~~~~~~ - -Because of the complexity of all the video stuff, the video mode IDs -used here are also a bit complex. A video mode ID is a 16-bit number usually -expressed in a hexadecimal notation (starting with "0x"). You can set a mode -by entering its mode directly if you know it even if it isn't shown on the menu. - -The ID numbers can be divided to those regions:: - - 0x0000 to 0x00ff - menu item references. 0x0000 is the first item. Don't use - outside the menu as this can change from boot to boot (especially if you - have used the ``scan`` feature). - - 0x0100 to 0x017f - standard BIOS modes. The ID is a BIOS video mode number - (as presented to INT 10, function 00) increased by 0x0100. - - 0x0200 to 0x08ff - VESA BIOS modes. The ID is a VESA mode ID increased by - 0x0100. All VESA modes should be autodetected and shown on the menu. - - 0x0900 to 0x09ff - Video7 special modes. Set by calling INT 0x10, AX=0x6f05. - (Usually 940=80x43, 941=132x25, 942=132x44, 943=80x60, 944=100x60, - 945=132x28 for the standard Video7 BIOS) - - 0x0f00 to 0x0fff - special modes (they are set by various tricks -- usually - by modifying one of the standard modes). Currently available: - 0x0f00 standard 80x25, don't reset mode if already set (=FFFF) - 0x0f01 standard with 8-point font: 80x43 on EGA, 80x50 on VGA - 0x0f02 VGA 80x43 (VGA switched to 350 scanlines with a 8-point font) - 0x0f03 VGA 80x28 (standard VGA scans, but 14-point font) - 0x0f04 leave current video mode - 0x0f05 VGA 80x30 (480 scans, 16-point font) - 0x0f06 VGA 80x34 (480 scans, 14-point font) - 0x0f07 VGA 80x60 (480 scans, 8-point font) - 0x0f08 Graphics hack (see the VIDEO_GFX_HACK paragraph below) - - 0x1000 to 0x7fff - modes specified by resolution. The code has a "0xRRCC" - form where RR is a number of rows and CC is a number of columns. - E.g., 0x1950 corresponds to a 80x25 mode, 0x2b84 to 132x43 etc. - This is the only fully portable way to refer to a non-standard mode, - but it relies on the mode being found and displayed on the menu - (remember that mode scanning is not done automatically). - - 0xff00 to 0xffff - aliases for backward compatibility: - 0xffff equivalent to 0x0f00 (standard 80x25) - 0xfffe equivalent to 0x0f01 (EGA 80x43 or VGA 80x50) - -If you add 0x8000 to the mode ID, the program will try to recalculate -vertical display timing according to mode parameters, which can be used to -eliminate some annoying bugs of certain VGA BIOSes (usually those used for -cards with S3 chipsets and old Cirrus Logic BIOSes) -- mainly extra lines at the -end of the display. - -Options -~~~~~~~ - -Build options for arch/x86/boot/* are selected by the kernel kconfig -utility and the kernel .config file. - -VIDEO_GFX_HACK - includes special hack for setting of graphics modes -to be used later by special drivers. -Allows to set _any_ BIOS mode including graphic ones and forcing specific -text screen resolution instead of peeking it from BIOS variables. Don't use -unless you think you know what you're doing. To activate this setup, use -mode number 0x0f08 (see the Mode IDs section above). - -Still doesn't work? -~~~~~~~~~~~~~~~~~~~ - -When the mode detection doesn't work (e.g., the mode list is incorrect or -the machine hangs instead of displaying the menu), try to switch off some of -the configuration options listed under "Options". If it fails, you can still use -your kernel with the video mode set directly via the kernel parameter. - -In either case, please send me a bug report containing what _exactly_ -happens and how do the configuration switches affect the behaviour of the bug. - -If you start Linux from M$-DOS, you might also use some DOS tools for -video mode setting. In this case, you must specify the 0x0f04 mode ("leave -current settings") to Linux, because if you don't and you use any non-standard -mode, Linux will switch to 80x25 automatically. - -If you set some extended mode and there's one or more extra lines on the -bottom of the display containing already scrolled-out text, your VGA BIOS -contains the most common video BIOS bug called "incorrect vertical display -end setting". Adding 0x8000 to the mode ID might fix the problem. Unfortunately, -this must be done manually -- no autodetection mechanisms are available. - -History -~~~~~~~ - -=============== ================================================================ -1.0 (??-Nov-95) First version supporting all adapters supported by the old - setup.S + Cirrus Logic 54XX. Present in some 1.3.4? kernels - and then removed due to instability on some machines. -2.0 (28-Jan-96) Rewritten from scratch. Cirrus Logic 64XX support added, almost - everything is configurable, the VESA support should be much more - stable, explicit mode numbering allowed, "scan" implemented etc. -2.1 (30-Jan-96) VESA modes moved to 0x200-0x3ff. Mode selection by resolution - supported. Few bugs fixed. VESA modes are listed prior to - modes supplied by SVGA autodetection as they are more reliable. - CLGD autodetect works better. Doesn't depend on 80x25 being - active when started. Scanning fixed. 80x43 (any VGA) added. - Code cleaned up. -2.2 (01-Feb-96) EGA 80x43 fixed. VESA extended to 0x200-0x4ff (non-standard 02XX - VESA modes work now). Display end bug workaround supported. - Special modes renumbered to allow adding of the "recalculate" - flag, 0xffff and 0xfffe became aliases instead of real IDs. - Screen contents retained during mode changes. -2.3 (15-Mar-96) Changed to work with 1.3.74 kernel. -2.4 (18-Mar-96) Added patches by Hans Lermen fixing a memory overwrite problem - with some boot loaders. Memory management rewritten to reflect - these changes. Unfortunately, screen contents retaining works - only with some loaders now. - Added a Tseng 132x60 mode. -2.5 (19-Mar-96) Fixed a VESA mode scanning bug introduced in 2.4. -2.6 (25-Mar-96) Some VESA BIOS errors not reported -- it fixes error reports on - several cards with broken VESA code (e.g., ATI VGA). -2.7 (09-Apr-96) - Accepted all VESA modes in range 0x100 to 0x7ff, because some - cards use very strange mode numbers. - - Added Realtek VGA modes (thanks to Gonzalo Tornaria). - - Hardware testing order slightly changed, tests based on ROM - contents done as first. - - Added support for special Video7 mode switching functions - (thanks to Tom Vander Aa). - - Added 480-scanline modes (especially useful for notebooks, - original version written by hhanemaa@cs.ruu.nl, patched by - Jeff Chua, rewritten by me). - - Screen store/restore fixed. -2.8 (14-Apr-96) - Previous release was not compilable without CONFIG_VIDEO_SVGA. - - Better recognition of text modes during mode scan. -2.9 (12-May-96) - Ignored VESA modes 0x80 - 0xff (more VESA BIOS bugs!) -2.10(11-Nov-96) - The whole thing made optional. - - Added the CONFIG_VIDEO_400_HACK switch. - - Added the CONFIG_VIDEO_GFX_HACK switch. - - Code cleanup. -2.11(03-May-97) - Yet another cleanup, now including also the documentation. - - Direct testing of SVGA adapters turned off by default, ``scan`` - offered explicitly on the prompt line. - - Removed the doc section describing adding of new probing - functions as I try to get rid of _all_ hardware probing here. -2.12(25-May-98) Added support for VESA frame buffer graphics. -2.13(14-May-99) Minor documentation fixes. -=============== ================================================================ diff --git a/Documentation/video-output.txt b/Documentation/video-output.txt deleted file mode 100644 index 56d6fa2e2368..000000000000 --- a/Documentation/video-output.txt +++ /dev/null @@ -1,34 +0,0 @@ -Video Output Switcher Control -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -2006 luming.yu@intel.com - -The output sysfs class driver provides an abstract video output layer that -can be used to hook platform specific methods to enable/disable video output -device through common sysfs interface. For example, on my IBM ThinkPad T42 -laptop, The ACPI video driver registered its output devices and read/write -method for 'state' with output sysfs class. The user interface under sysfs is:: - - linux:/sys/class/video_output # tree . - . - |-- CRT0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- DVI0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - |-- LCD0 - | |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - | |-- state - | |-- subsystem -> ../../../class/video_output - | `-- uevent - `-- TV0 - |-- device -> ../../../devices/pci0000:00/0000:00:01.0 - |-- state - |-- subsystem -> ../../../class/video_output - `-- uevent - diff --git a/Documentation/x86/topology.rst b/Documentation/x86/topology.rst index 8e9704f61017..e29739904e37 100644 --- a/Documentation/x86/topology.rst +++ b/Documentation/x86/topology.rst @@ -9,7 +9,7 @@ representation in the kernel. Update/change when doing changes to the respective code. The architecture-agnostic topology definitions are in -Documentation/cputopology.txt. This file holds x86-specific +Documentation/admin-guide/cputopology.rst. This file holds x86-specific differences/specialities which must not necessarily apply to the generic definitions. Thus, the way to read up on Linux topology on x86 is to start with the generic one and look at this one in parallel for the x86 specifics. diff --git a/MAINTAINERS b/MAINTAINERS index c1593a668f80..570572627fd1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6080,7 +6080,7 @@ M: Ard Biesheuvel L: linux-efi@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/efi/efi.git S: Maintained -F: Documentation/efi-stub.txt +F: Documentation/admin-guide/efi-stub.rst F: arch/*/kernel/efi.c F: arch/x86/boot/compressed/eboot.[ch] F: arch/*/include/asm/efi.h @@ -7088,7 +7088,7 @@ M: Herbert Xu L: linux-crypto@vger.kernel.org S: Odd fixes F: Documentation/devicetree/bindings/rng/ -F: Documentation/hw_random.txt +F: Documentation/admin-guide/hw_random.rst F: drivers/char/hw_random/ F: include/linux/hw_random.h @@ -9398,7 +9398,7 @@ M: "Richard Russon (FlatCap)" L: linux-ntfs-dev@lists.sourceforge.net W: http://www.linux-ntfs.org/content/view/19/37/ S: Maintained -F: Documentation/ldm.txt +F: Documentation/admin-guide/ldm.rst F: block/partitions/ldm.* LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) @@ -12058,7 +12058,7 @@ PARALLEL LCD/KEYPAD PANEL DRIVER M: Willy Tarreau M: Ksenija Stanojevic S: Odd Fixes -F: Documentation/auxdisplay/lcd-panel-cgram.rst +F: Documentation/admin-guide/lcd-panel-cgram.rst F: drivers/auxdisplay/panel.c PARALLEL PORT SUBSYSTEM @@ -13476,7 +13476,7 @@ Q: http://patchwork.ozlabs.org/project/rtc-linux/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git S: Maintained F: Documentation/devicetree/bindings/rtc/ -F: Documentation/rtc.txt +F: Documentation/admin-guide/rtc.rst F: drivers/rtc/ F: include/linux/rtc.h F: include/uapi/linux/rtc.h @@ -15306,7 +15306,7 @@ SVGA HANDLING M: Martin Mares L: linux-video@atrey.karlin.mff.cuni.cz S: Maintained -F: Documentation/svga.txt +F: Documentation/admin-guide/svga.rst F: arch/x86/boot/video* SWIOTLB SUBSYSTEM diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 20afd6077465..600c5ba1af41 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1297,7 +1297,7 @@ config SMP will run faster if you say N here. See also , - and the SMP-HOWTO available at + and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 42875ff15671..6d732e451071 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -277,7 +277,7 @@ config SMP machines, but will use only one CPU of a multiprocessor machine. On a uniprocessor machine, the kernel will run faster if you say N. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c2858ac6a46a..6b1b5941b618 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -679,7 +679,7 @@ config SMP People using multiprocessor machines who say Y here should also say Y to "Enhanced Real Time Clock Support", below. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index e9f5d62e9817..7926a2e11bdc 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -180,7 +180,7 @@ config SMP Y to "Enhanced Real Time Clock Support", below. The "Advanced Power Management" code will be disabled if you say Y here. - See also and the SMP-HOWTO + See also and the SMP-HOWTO available at . If you don't know what to do here, say N. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9505066b7ba3..9e95af666b33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -402,7 +402,7 @@ config SMP Management" code will be disabled if you say Y here. See also , - and the SMP-HOWTO available at + and the SMP-HOWTO available at . If you don't know what to do here, say N. @@ -1959,7 +1959,7 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. - See Documentation/efi-stub.txt for more information. + See Documentation/admin-guide/efi-stub.rst for more information. config EFI_MIXED bool "EFI mixed-mode support" diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 37b9710cc80a..702689a628f0 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -194,7 +194,7 @@ config LDM_PARTITION Normal partitions are now called Basic Disks under Windows 2000, XP, and Vista. - For a fuller description read . + For a fuller description read . If unsure, say N. diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 442403abd73a..3e866885a405 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -291,7 +291,7 @@ config RTC and set the RTC in an SMP compatible fashion. If you think you have a use for such a device (such as periodic data - sampling), then say Y here, and read + sampling), then say Y here, and read for details. To compile this driver as a module, choose M here: the @@ -313,7 +313,7 @@ config JS_RTC /dev/rtc. If you think you have a use for such a device (such as periodic data - sampling), then say Y here, and read + sampling), then say Y here, and read for details. To compile this driver as a module, choose M here: the diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index 95be7228f327..9044d31ab1a1 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -4,7 +4,7 @@ * Copyright 2006 Michael Buesch * Copyright 2005 (c) MontaVista Software, Inc. * - * Please read Documentation/hw_random.txt for details on use. + * Please read Documentation/admin-guide/hw_random.rst for details on use. * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. diff --git a/include/linux/hw_random.h b/include/linux/hw_random.h index c0b93e0ff0c0..8e6dd908da21 100644 --- a/include/linux/hw_random.h +++ b/include/linux/hw_random.h @@ -1,7 +1,7 @@ /* Hardware Random Number Generator - Please read Documentation/hw_random.txt for details on use. + Please read Documentation/admin-guide/hw_random.rst for details on use. ---------------------------------------------------------- This software may be used and distributed according to the terms -- cgit v1.2.3-70-g09d2